in src/Lucene.Net.Benchmark/Support/TagSoup/HTMLScanner.cs [395:684]
/// <summary>
/// Scans text from <paramref name="r"/> by running the table-driven state machine
/// from <c>S_PCDATA</c> until it reaches <c>S_DONE</c>, reporting each recognized
/// construct to <paramref name="h"/> and finishing with a call to
/// <see cref="IScanHandler.EOF"/>.
/// </summary>
/// <remarks>
/// <para>
/// Input is pre-processed one character at a time before the state table is consulted:
/// a leading U+FEFF is skipped, code points 0x80-0x9F are remapped through
/// <c>theWinMap</c>, a CR or CR LF pair is presented to the state machine as a single
/// <c>'\n'</c>, and any other character below 0x20 (except tab) is discarded.
/// </para>
/// <para>
/// An error created by <c>Error.Create</c> is thrown when the state table has no entry
/// for the current state/character pair, or when the table yields an action this
/// method does not recognize.
/// </para>
/// </remarks>
/// <param name="r">Reader supplying the characters to scan. It is only read through
/// <see cref="TextReader.Peek"/> and <see cref="TextReader.Read()"/>.</param>
/// <param name="h">Handler that receives the scanner callbacks.</param>
public virtual void Scan(TextReader r, IScanHandler h)
{
    theState = S_PCDATA;
    int firstChar = r.Peek(); // Remove any leading BOM
    if (firstChar == '\uFEFF') r.Read();
    while (theState != S_DONE)
    {
        // The current character is only peeked here. It is consumed at the bottom of
        // the loop unless an action sets 'unread' to leave it in the reader so that it
        // is seen again on the next iteration.
        int ch = r.Peek();
        bool unread = false;
        // Process control characters
        if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch - 0x80];
        if (ch == '\r')
        {
            // Consume the CR and look at what follows it.
            r.Read();
            ch = r.Peek(); // expect LF next
            if (ch != '\n')
            {
                // Lone CR: treat it as a newline and leave the following character
                // in the reader for the next iteration.
                unread = true;
                ch = '\n';
            }
        }
        if (ch == '\n')
        {
            theCurrentLine++;
            theCurrentColumn = 0;
        }
        else
        {
            theCurrentColumn++;
        }
        if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1))
        {
            // Discard the control character. It has only been peeked so far, so it
            // must be consumed here; otherwise the next iteration would peek the very
            // same character again and the loop would never advance.
            r.Read();
            continue;
        }
        // Search state table
        // Characters outside the indexed range share the slot for -2; the +2 offset
        // makes -2 and -1 (end of input) valid array indexes.
        int adjCh = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2;
        int statetableRow = statetableIndex[theState][adjCh + 2];
        int action = 0;
        if (statetableRow != -1)
        {
            action = statetable[statetableRow + 2];
            theNextState = statetable[statetableRow + 3];
        }
        // System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
        switch (action)
        {
            case 0:
                // No table entry for this state/character combination.
                throw Error.Create(
                    "HTMLScanner can't cope with " + ch + " in state " +
                    theState);
            case A_ADUP:
                h.Adup(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_ADUP_SAVE:
                h.Adup(theOutputBuffer, 0, theSize);
                theSize = 0;
                Save(ch, h);
                break;
            case A_ADUP_STAGC:
                h.Adup(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.STagC(theOutputBuffer, 0, theSize);
                break;
            case A_ANAME:
                h.Aname(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_ANAME_ADUP:
                h.Aname(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.Adup(theOutputBuffer, 0, theSize);
                break;
            case A_ANAME_ADUP_STAGC:
                h.Aname(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.Adup(theOutputBuffer, 0, theSize);
                h.STagC(theOutputBuffer, 0, theSize);
                break;
            case A_AVAL:
                h.Aval(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_AVAL_STAGC:
                h.Aval(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.STagC(theOutputBuffer, 0, theSize);
                break;
            case A_CDATA:
                Mark();
                // suppress the final "]]" in the buffer
                if (theSize > 1) theSize -= 2;
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_ENTITY_START:
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                Save(ch, h);
                break;
            case A_ENTITY:
                Mark();
                char ch1 = (char)ch;
                // System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
                // While the character can still belong to the entity reference, keep
                // accumulating it in the buffer and stay in (or move to) an entity state.
                if (theState == S_ENT && ch1 == '#')
                {
                    theNextState = S_NCR;
                    Save(ch, h);
                    break;
                }
                else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X'))
                {
                    theNextState = S_XNCR;
                    Save(ch, h);
                    break;
                }
                else if (theState == S_ENT && char.IsLetterOrDigit(ch1))
                {
                    Save(ch, h);
                    break;
                }
                else if (theState == S_NCR && char.IsDigit(ch1))
                {
                    Save(ch, h);
                    break;
                }
                else if (theState == S_XNCR && (char.IsDigit(ch1) || "abcdefABCDEF".IndexOf(ch1) != -1))
                {
                    Save(ch, h);
                    break;
                }
                // The whole entity reference has been collected
                // System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
                // Offset 1 skips the first buffered character of the reference.
                h.Entity(theOutputBuffer, 1, theSize - 1);
                int ent = h.GetEntity();
                // System.err.println("%% value = " + ent);
                if (ent != 0)
                {
                    // The handler resolved the entity: replace the buffered reference
                    // text with the resolved character(s).
                    theSize = 0;
                    if (ent >= 0x80 && ent <= 0x9F)
                    {
                        ent = theWinMap[ent - 0x80];
                    }
                    if (ent < 0x20)
                    {
                        // Control becomes space
                        // NOTE: nothing is saved on this branch, so no character is emitted.
                        //ent = 0x20; // LUCENENET: IDE0059: Remove unnecessary value assignment
                    }
                    else if (ent >= 0xD800 && ent <= 0xDFFF)
                    {
                        // Surrogates get dropped
                        //ent = 0; // LUCENENET: IDE0059: Remove unnecessary value assignment
                    }
                    else if (ent <= 0xFFFF)
                    {
                        // BMP character
                        Save(ent, h);
                    }
                    else
                    {
                        // Astral converted to two surrogates
                        ent -= 0x10000;
                        Save((ent >> 10) + 0xD800, h);
                        Save((ent & 0x3FF) + 0xDC00, h);
                    }
                    if (ch != ';')
                    {
                        // The terminating character is not part of the reference;
                        // leave it in the reader so it is scanned again.
                        unread = true;
                        theCurrentColumn--;
                    }
                }
                else
                {
                    // Unresolved entity: the buffered text is kept as is and the
                    // current character is scanned again in the next state.
                    unread = true;
                    theCurrentColumn--;
                }
                theNextState = S_PCDATA;
                break;
            case A_ETAG:
                h.ETag(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_DECL:
                h.Decl(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_GI:
                h.GI(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_GI_STAGC:
                h.GI(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.STagC(theOutputBuffer, 0, theSize);
                break;
            case A_LT:
                Mark();
                Save('<', h);
                Save(ch, h);
                break;
            case A_LT_PCDATA:
                Mark();
                Save('<', h);
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_PCDATA:
                Mark();
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_CMNT:
                Mark();
                h.Cmnt(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_MINUS3:
                Save('-', h);
                Save(' ', h);
                break;
            case A_MINUS2:
                Save('-', h);
                Save(' ', h);
                // C# has no implicit fall-through, so the A_MINUS actions are repeated here.
                Save('-', h);
                Save(ch, h);
                break;
            case A_MINUS:
                Save('-', h);
                Save(ch, h);
                break;
            case A_PI:
                Mark();
                h.PI(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_PITARGET:
                h.PITarget(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_PITARGET_PI:
                h.PITarget(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.PI(theOutputBuffer, 0, theSize);
                break;
            case A_SAVE:
                Save(ch, h);
                break;
            case A_SKIP:
                break;
            case A_SP:
                Save(' ', h);
                break;
            case A_STAGC:
                h.STagC(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            case A_EMPTYTAG:
                Mark();
                // System.err.println("%%% Empty tag seen");
                if (theSize > 0) h.GI(theOutputBuffer, 0, theSize);
                theSize = 0;
                h.STagE(theOutputBuffer, 0, theSize);
                break;
            case A_UNGET:
                // Leave the character in the reader and undo its column increment.
                unread = true;
                theCurrentColumn--;
                break;
            case A_UNSAVE_PCDATA:
                if (theSize > 0) theSize--;
                h.PCDATA(theOutputBuffer, 0, theSize);
                theSize = 0;
                break;
            default:
                throw Error.Create("Can't process state " + action);
        }
        if (!unread)
        {
            // Consume the character that was peeked at the top of the loop.
            r.Read();
        }
        theState = theNextState;
    }
    h.EOF(theOutputBuffer, 0, 0);
}