public class HTMLScanner extends java.lang.Object implements HTMLComponent
This component recognizes the following features:
This component recognizes the following properties:
See also: HTMLElements, HTMLEntities
Modifier and Type | Class and Description |
---|---|
class |
HTMLScanner.ContentScanner
The primary HTML document scanner.
|
static class |
HTMLScanner.CurrentEntity
Current entity.
|
protected static class |
HTMLScanner.LocationItem
Location infoset item.
|
static class |
HTMLScanner.PlaybackInputStream
A playback input stream.
|
static interface |
HTMLScanner.Scanner
Basic scanner interface.
|
class |
HTMLScanner.SpecialScanner
Special scanner used for elements whose content needs to be scanned
as plain text, ignoring markup such as elements and entity references.
|
Modifier and Type | Field and Description |
---|---|
static java.lang.String |
ALLOW_SELFCLOSING_IFRAME
Allows self-closing <iframe/> tags.
|
static java.lang.String |
ALLOW_SELFCLOSING_TAGS
Allows self-closing tags, e.g. <div/>.
|
protected static java.lang.String |
AUGMENTATIONS
Include infoset augmentations.
|
static java.lang.String |
CDATA_SECTIONS
Scan CDATA sections.
|
protected static boolean |
DEBUG_CALLBACKS
Set to true to debug callbacks.
|
protected static int |
DEFAULT_BUFFER_SIZE
Default buffer size.
|
protected static java.lang.String |
DEFAULT_ENCODING
Default encoding.
|
protected static java.lang.String |
DOCTYPE_PUBID
Doctype declaration public identifier.
|
protected static java.lang.String |
DOCTYPE_SYSID
Doctype declaration system identifier.
|
protected static java.lang.String |
ERROR_REPORTER
Error reporter.
|
protected boolean |
fAllowSelfclosingIframe
Allows self closing iframe tags.
|
protected boolean |
fAllowSelfclosingTags
Allows self closing tags.
|
protected boolean |
fAugmentations
Augmentations.
|
protected int |
fBeginCharacterOffset
Beginning character offset in the file.
|
protected int |
fBeginColumnNumber
Beginning column number.
|
protected int |
fBeginLineNumber
Beginning line number.
|
protected HTMLScanner.PlaybackInputStream |
fByteStream
The playback byte stream.
|
protected boolean |
fCDATASections
CDATA sections.
|
protected HTMLScanner.Scanner |
fContentScanner
Content scanner.
|
protected HTMLScanner.CurrentEntity |
fCurrentEntity
Current entity.
|
protected java.util.Stack |
fCurrentEntityStack
The current entity stack.
|
protected java.lang.String |
fDefaultIANAEncoding
Default encoding.
|
protected java.lang.String |
fDoctypePubid
Doctype declaration public identifier.
|
protected java.lang.String |
fDoctypeSysid
Doctype declaration system identifier.
|
protected XMLDocumentHandler |
fDocumentHandler
The document handler.
|
protected int |
fElementCount
Element count.
|
protected int |
fElementDepth
Element depth.
|
protected int |
fEndCharacterOffset
Ending character offset in the file.
|
protected int |
fEndColumnNumber
Ending column number.
|
protected int |
fEndLineNumber
Ending line number.
|
protected HTMLErrorReporter |
fErrorReporter
Error reporter.
|
protected boolean |
fFixWindowsCharRefs
Fix Microsoft Windows® character entity references.
|
protected java.lang.String |
fIANAEncoding
Auto-detected IANA encoding.
|
protected boolean |
fIgnoreSpecifiedCharset
Ignore specified character set.
|
protected boolean |
fInsertDoctype
Insert document type declaration.
|
protected boolean |
fIso8859Encoding
True if the encoding matches "ISO-8859-*".
|
static java.lang.String |
FIX_MSWINDOWS_REFS
Fix Microsoft Windows® character entity references.
|
protected java.lang.String |
fJavaEncoding
Auto-detected Java encoding.
|
protected short |
fNamesAttrs
Modify HTML attribute names.
|
protected short |
fNamesElems
Modify HTML element names.
|
protected boolean |
fNormalizeAttributes
Normalize attribute values.
|
protected boolean |
fNotifyCharRefs
Notify character entity references.
|
protected boolean |
fNotifyHtmlBuiltinRefs
Notify HTML built-in general entity references.
|
protected boolean |
fNotifyXmlBuiltinRefs
Notify XML built-in general entity references.
|
protected boolean |
fOverrideDoctype
Override doctype declaration public and system identifiers.
|
protected boolean |
fParseNoFramesContent
Parse noframes content.
|
protected boolean |
fParseNoScriptContent
Parse noscript content.
|
protected boolean |
fReportErrors
Report errors.
|
protected HTMLScanner.Scanner |
fScanner
The current scanner.
|
protected short |
fScannerState
The current scanner state.
|
protected boolean |
fScriptStripCDATADelims
Strip CDATA delimiters from SCRIPT tags.
|
protected boolean |
fScriptStripCommentDelims
Strip comment delimiters from SCRIPT tags.
|
protected HTMLScanner.SpecialScanner |
fSpecialScanner
Special scanner used for elements whose content needs to be scanned
as plain text, ignoring markup such as elements and entity references.
|
protected XMLStringBuffer |
fStringBuffer
String buffer.
|
protected boolean |
fStyleStripCDATADelims
Strip CDATA delimiters from STYLE tags.
|
protected boolean |
fStyleStripCommentDelims
Strip comment delimiters from STYLE tags.
|
static java.lang.String |
HTML_4_01_FRAMESET_PUBID
HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN").
|
static java.lang.String |
HTML_4_01_FRAMESET_SYSID
HTML 4.01 frameset system identifier ("http://www.w3.org/TR/html4/frameset.dtd").
|
static java.lang.String |
HTML_4_01_STRICT_PUBID
HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN").
|
static java.lang.String |
HTML_4_01_STRICT_SYSID
HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd").
|
static java.lang.String |
HTML_4_01_TRANSITIONAL_PUBID
HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 Transitional//EN").
|
static java.lang.String |
HTML_4_01_TRANSITIONAL_SYSID
HTML 4.01 transitional system identifier ("http://www.w3.org/TR/html4/loose.dtd").
|
static java.lang.String |
IGNORE_SPECIFIED_CHARSET
Ignore specified charset found in the <meta http-equiv='Content-Type'
content='text/html;charset=…'> tag or in the <?xml … encoding='…'?> processing instruction.
|
static java.lang.String |
INSERT_DOCTYPE
Insert document type declaration.
|
protected static java.lang.String |
NAMES_ATTRS
Modify HTML attribute names: { "upper", "lower", "default" }.
|
protected static java.lang.String |
NAMES_ELEMS
Modify HTML element names: { "upper", "lower", "default" }.
|
protected static short |
NAMES_LOWERCASE
Lowercase HTML names.
|
protected static short |
NAMES_NO_CHANGE
Don't modify HTML names.
|
protected static short |
NAMES_UPPERCASE
Uppercase HTML names.
|
protected static java.lang.String |
NORMALIZE_ATTRIBUTES
Normalize attribute values.
|
static java.lang.String |
NOTIFY_CHAR_REFS
Notify character entity references (e.g. &#32;, &#x20;, etc.).
|
static java.lang.String |
NOTIFY_HTML_BUILTIN_REFS
Notify handler of built-in HTML entity references (e.g. &nbsp;, &copy;, etc.).
|
static java.lang.String |
NOTIFY_XML_BUILTIN_REFS
Notify handler of built-in XML entity references (e.g. &amp;, &lt;, etc.).
|
static java.lang.String |
OVERRIDE_DOCTYPE
Override doctype declaration public and system identifiers.
|
static java.lang.String |
PARSE_NOSCRIPT_CONTENT
Parse <noscript>...</noscript> content.
|
protected static java.lang.String |
REPORT_ERRORS
Report errors.
|
static java.lang.String |
SCRIPT_STRIP_CDATA_DELIMS
Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from SCRIPT tag contents.
|
static java.lang.String |
SCRIPT_STRIP_COMMENT_DELIMS
Strip HTML comment delimiters ("<!--" and "-->") from SCRIPT tag contents.
|
protected static short |
STATE_CONTENT
State: content.
|
protected static short |
STATE_END_DOCUMENT
State: end document.
|
protected static short |
STATE_MARKUP_BRACKET
State: markup bracket.
|
protected static short |
STATE_START_DOCUMENT
State: start document.
|
static java.lang.String |
STYLE_STRIP_CDATA_DELIMS
Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from STYLE tag contents.
|
static java.lang.String |
STYLE_STRIP_COMMENT_DELIMS
Strip HTML comment delimiters ("<!--" and "-->") from STYLE tag contents.
|
protected static HTMLEventInfo |
SYNTHESIZED_ITEM
Synthesized event info item.
|
Constructor and Description |
---|
HTMLScanner() |
Modifier and Type | Method and Description |
---|---|
protected static boolean |
builtinXmlRef(java.lang.String name)
Returns true if the name is a built-in XML general entity reference.
|
void |
cleanup(boolean closeall)
Cleans up used resources.
|
void |
evaluateInputSource(XMLInputSource inputSource)
Immediately evaluates an input source and adds the new content (e.g. the output written by an embedded script).
|
static java.lang.String |
expandSystemId(java.lang.String systemId,
java.lang.String baseSystemId)
Expands a system id and returns the system id as a URI, if
it can be expanded.
|
protected static java.lang.String |
fixURI(java.lang.String str)
Fixes a platform dependent filename to standard URI form.
|
protected int |
fixWindowsCharacter(int origChar)
Fixes Microsoft Windows® specific characters.
|
java.lang.String |
getBaseSystemId()
Returns the base system identifier.
|
int |
getCharacterOffset()
Returns the character offset.
|
int |
getColumnNumber()
Returns the current column number.
|
XMLDocumentHandler |
getDocumentHandler()
Returns the document handler.
|
java.lang.String |
getEncoding()
Returns the encoding.
|
java.lang.String |
getExpandedSystemId()
Returns the expanded system identifier.
|
java.lang.Boolean |
getFeatureDefault(java.lang.String featureId)
Returns the default state for a feature.
|
int |
getLineNumber()
Returns the current line number.
|
java.lang.String |
getLiteralSystemId()
Returns the literal system identifier.
|
protected static short |
getNamesValue(java.lang.String value)
Converts HTML names string value to constant value.
|
java.lang.Object |
getPropertyDefault(java.lang.String propertyId)
Returns the default state for a property.
|
java.lang.String |
getPublicId()
Returns the public identifier.
|
java.lang.String[] |
getRecognizedFeatures()
Returns recognized features.
|
java.lang.String[] |
getRecognizedProperties()
Returns recognized properties.
|
protected static java.lang.String |
getValue(XMLAttributes attrs,
java.lang.String aname)
Returns the value of the specified attribute, ignoring case.
|
java.lang.String |
getXMLVersion()
Returns the XML version.
|
protected Augmentations |
locationAugs()
Returns an augmentations object with a location item added.
|
protected static java.lang.String |
modifyName(java.lang.String name,
short mode)
Modifies the given name based on the specified mode.
|
void |
pushInputSource(XMLInputSource inputSource)
Pushes an input source onto the current entity stack.
|
protected int |
read()
Reads a single character.
|
protected int |
readPreservingBufferContent()
Reads a single character, preserving the old buffer content.
|
void |
reset(XMLComponentManager manager)
Resets the component.
|
protected XMLResourceIdentifier |
resourceId()
Returns an empty resource identifier.
|
protected void |
scanDoctype()
Scans a DOCTYPE line.
|
boolean |
scanDocument(boolean complete)
Scans the document.
|
protected int |
scanEntityRef(XMLStringBuffer str,
boolean content)
Scans an entity reference.
|
protected java.lang.String |
scanLiteral()
Scans a quoted literal.
|
protected java.lang.String |
scanName(boolean strict)
Scans a name.
|
void |
setDocumentHandler(XMLDocumentHandler handler)
Sets the document handler.
|
void |
setFeature(java.lang.String featureId,
boolean state)
Sets a feature.
|
void |
setInputSource(XMLInputSource source)
Sets the input source.
|
void |
setProperty(java.lang.String propertyId,
java.lang.Object value)
Sets a property.
|
protected void |
setScanner(HTMLScanner.Scanner scanner)
Sets the scanner.
|
protected void |
setScannerState(short state)
Sets the scanner state.
|
protected boolean |
skip(java.lang.String s,
boolean caseSensitive)
Returns true if the specified text is present and is skipped.
|
protected boolean |
skipMarkup(boolean balance)
Skips markup.
|
protected int |
skipNewlines()
Skips newlines and returns the number of newlines skipped.
|
protected boolean |
skipSpaces()
Skips whitespace.
|
protected Augmentations |
synthesizedAugs()
Returns an augmentations object with a synthesized item added.
|
public static final java.lang.String HTML_4_01_STRICT_PUBID
public static final java.lang.String HTML_4_01_STRICT_SYSID
public static final java.lang.String HTML_4_01_TRANSITIONAL_PUBID
public static final java.lang.String HTML_4_01_TRANSITIONAL_SYSID
public static final java.lang.String HTML_4_01_FRAMESET_PUBID
public static final java.lang.String HTML_4_01_FRAMESET_SYSID
protected static final java.lang.String AUGMENTATIONS
protected static final java.lang.String REPORT_ERRORS
public static final java.lang.String NOTIFY_CHAR_REFS
public static final java.lang.String NOTIFY_XML_BUILTIN_REFS
Note: This only applies to the five pre-defined XML general entities. Specifically, "amp", "lt", "gt", "quot", and "apos". This is done for compatibility with the Xerces feature.
To be notified of the built-in entity references in HTML, set the http://cyberneko.org/html/features/scanner/notify-builtin-refs feature to true.
public static final java.lang.String NOTIFY_HTML_BUILTIN_REFS
Note: This includes the five pre-defined XML general entities.
public static final java.lang.String FIX_MSWINDOWS_REFS
public static final java.lang.String SCRIPT_STRIP_COMMENT_DELIMS
public static final java.lang.String SCRIPT_STRIP_CDATA_DELIMS
public static final java.lang.String STYLE_STRIP_COMMENT_DELIMS
public static final java.lang.String STYLE_STRIP_CDATA_DELIMS
public static final java.lang.String IGNORE_SPECIFIED_CHARSET
public static final java.lang.String CDATA_SECTIONS
public static final java.lang.String OVERRIDE_DOCTYPE
public static final java.lang.String INSERT_DOCTYPE
public static final java.lang.String PARSE_NOSCRIPT_CONTENT
public static final java.lang.String ALLOW_SELFCLOSING_IFRAME
public static final java.lang.String ALLOW_SELFCLOSING_TAGS
protected static final java.lang.String NORMALIZE_ATTRIBUTES
protected static final java.lang.String NAMES_ELEMS
protected static final java.lang.String NAMES_ATTRS
protected static final java.lang.String DEFAULT_ENCODING
protected static final java.lang.String ERROR_REPORTER
protected static final java.lang.String DOCTYPE_PUBID
protected static final java.lang.String DOCTYPE_SYSID
protected static final short STATE_CONTENT
protected static final short STATE_MARKUP_BRACKET
protected static final short STATE_START_DOCUMENT
protected static final short STATE_END_DOCUMENT
protected static final short NAMES_NO_CHANGE
protected static final short NAMES_UPPERCASE
protected static final short NAMES_LOWERCASE
protected static final int DEFAULT_BUFFER_SIZE
protected static final boolean DEBUG_CALLBACKS
protected static final HTMLEventInfo SYNTHESIZED_ITEM
protected boolean fAugmentations
protected boolean fReportErrors
protected boolean fNotifyCharRefs
protected boolean fNotifyXmlBuiltinRefs
protected boolean fNotifyHtmlBuiltinRefs
protected boolean fFixWindowsCharRefs
protected boolean fScriptStripCDATADelims
protected boolean fScriptStripCommentDelims
protected boolean fStyleStripCDATADelims
protected boolean fStyleStripCommentDelims
protected boolean fIgnoreSpecifiedCharset
protected boolean fCDATASections
protected boolean fOverrideDoctype
protected boolean fInsertDoctype
protected boolean fNormalizeAttributes
protected boolean fParseNoScriptContent
protected boolean fParseNoFramesContent
protected boolean fAllowSelfclosingIframe
protected boolean fAllowSelfclosingTags
protected short fNamesElems
protected short fNamesAttrs
protected java.lang.String fDefaultIANAEncoding
protected HTMLErrorReporter fErrorReporter
protected java.lang.String fDoctypePubid
protected java.lang.String fDoctypeSysid
protected int fBeginLineNumber
protected int fBeginColumnNumber
protected int fBeginCharacterOffset
protected int fEndLineNumber
protected int fEndColumnNumber
protected int fEndCharacterOffset
protected HTMLScanner.PlaybackInputStream fByteStream
protected HTMLScanner.CurrentEntity fCurrentEntity
protected final java.util.Stack fCurrentEntityStack
protected HTMLScanner.Scanner fScanner
protected short fScannerState
protected XMLDocumentHandler fDocumentHandler
protected java.lang.String fIANAEncoding
protected java.lang.String fJavaEncoding
protected boolean fIso8859Encoding
protected int fElementCount
protected int fElementDepth
protected HTMLScanner.Scanner fContentScanner
protected HTMLScanner.SpecialScanner fSpecialScanner
protected final XMLStringBuffer fStringBuffer
public void pushInputSource(XMLInputSource inputSource)
Note: This functionality is experimental at this time and is subject to change in future releases of NekoHTML.
inputSource
- The new input source to start scanning.
See also: evaluateInputSource(XMLInputSource)
public void evaluateInputSource(XMLInputSource inputSource)
inputSource
- The new input source to start evaluating.
See also: pushInputSource(XMLInputSource)
public void cleanup(boolean closeall)
closeall
- Close all streams, including the original.
This is used in cases when the application has
opened the original document stream and should
be responsible for closing it.
public java.lang.String getEncoding()
public java.lang.String getPublicId()
public java.lang.String getBaseSystemId()
public java.lang.String getLiteralSystemId()
public java.lang.String getExpandedSystemId()
public int getLineNumber()
public int getColumnNumber()
public java.lang.String getXMLVersion()
public int getCharacterOffset()
public java.lang.Boolean getFeatureDefault(java.lang.String featureId)
getFeatureDefault
in interface HTMLComponent
public java.lang.Object getPropertyDefault(java.lang.String propertyId)
getPropertyDefault
in interface HTMLComponent
public java.lang.String[] getRecognizedFeatures()
public java.lang.String[] getRecognizedProperties()
public void reset(XMLComponentManager manager) throws XMLConfigurationException
XMLConfigurationException
public void setFeature(java.lang.String featureId, boolean state)
public void setProperty(java.lang.String propertyId, java.lang.Object value) throws XMLConfigurationException
XMLConfigurationException
public void setInputSource(XMLInputSource source) throws java.io.IOException
java.io.IOException
public boolean scanDocument(boolean complete) throws XNIException, java.io.IOException
XNIException
java.io.IOException
public void setDocumentHandler(XMLDocumentHandler handler)
public XMLDocumentHandler getDocumentHandler()
protected static java.lang.String getValue(XMLAttributes attrs, java.lang.String aname)
public static java.lang.String expandSystemId(java.lang.String systemId, java.lang.String baseSystemId)
systemId
- The systemId to be expanded.
protected static java.lang.String fixURI(java.lang.String str)
str
- The string to fix.
protected static final java.lang.String modifyName(java.lang.String name, short mode)
protected static final short getNamesValue(java.lang.String value)
See also: NAMES_NO_CHANGE, NAMES_LOWERCASE, NAMES_UPPERCASE
protected int fixWindowsCharacter(int origChar)
Details about this common problem can be found at http://www.cs.tut.fi/~jkorpela/www/windows-chars.html
protected int read() throws java.io.IOException
java.io.IOException
protected void setScanner(HTMLScanner.Scanner scanner)
protected void setScannerState(short state)
protected void scanDoctype() throws java.io.IOException
java.io.IOException
protected java.lang.String scanLiteral() throws java.io.IOException
java.io.IOException
protected java.lang.String scanName(boolean strict) throws java.io.IOException
java.io.IOException
protected int scanEntityRef(XMLStringBuffer str, boolean content) throws java.io.IOException
java.io.IOException
protected boolean skip(java.lang.String s, boolean caseSensitive) throws java.io.IOException
java.io.IOException
protected boolean skipMarkup(boolean balance) throws java.io.IOException
java.io.IOException
protected boolean skipSpaces() throws java.io.IOException
java.io.IOException
protected int skipNewlines() throws java.io.IOException
java.io.IOException
protected final Augmentations locationAugs()
protected final Augmentations synthesizedAugs()
protected final XMLResourceIdentifier resourceId()
protected static boolean builtinXmlRef(java.lang.String name)
protected int readPreservingBufferContent() throws java.io.IOException
java.io.IOException
(C) Copyright 2002-2014, Andy Clark, Marc Guillemot. All rights reserved.