Merge pull request 'merge version 1.' (#1) from master into main
Reviewed-on: https://src.isharkfly.com/honeymoose/HtmlCleaner/pulls/1
This commit is contained in:
Generated
+12
@@ -0,0 +1,12 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Environment-dependent path to Maven home directory
|
||||
/mavenHomeManager.xml
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
# Zeppelin ignored files
|
||||
/ZeppelinRemoteNotebooks/
|
||||
Generated
+16
@@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="CheckStyle-IDEA" serialisationVersion="2">
|
||||
<checkstyleVersion>10.23.0</checkstyleVersion>
|
||||
<scanScope>JavaOnly</scanScope>
|
||||
<copyLibs>true</copyLibs>
|
||||
<option name="thirdPartyClasspath" />
|
||||
<option name="activeLocationIds" />
|
||||
<option name="locations">
|
||||
<list>
|
||||
<ConfigurationLocation id="bundled-sun-checks" type="BUNDLED" scope="All" description="Sun Checks">(bundled)</ConfigurationLocation>
|
||||
<ConfigurationLocation id="bundled-google-checks" type="BUNDLED" scope="All" description="Google Checks">(bundled)</ConfigurationLocation>
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
</project>
|
||||
Generated
+13
@@ -0,0 +1,13 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="CompilerConfiguration">
|
||||
<annotationProcessing>
|
||||
<profile name="Maven default annotation processors profile" enabled="true">
|
||||
<sourceOutputDir name="target/generated-sources/annotations" />
|
||||
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
|
||||
<outputRelativeToContentRoot value="true" />
|
||||
<module name="htmlcleaner" />
|
||||
</profile>
|
||||
</annotationProcessing>
|
||||
</component>
|
||||
</project>
|
||||
Generated
+7
@@ -0,0 +1,7 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Encoding">
|
||||
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
|
||||
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
|
||||
</component>
|
||||
</project>
|
||||
Generated
+35
@@ -0,0 +1,35 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="RemoteRepositoriesConfiguration">
|
||||
<remote-repository>
|
||||
<option name="id" value="sonatype-nexus-snapshots" />
|
||||
<option name="name" value="Sonatype Nexus Snapshots" />
|
||||
<option name="url" value="https://oss.sonatype.org/content/repositories/snapshots" />
|
||||
</remote-repository>
|
||||
<remote-repository>
|
||||
<option name="id" value="ossez-repo-releases" />
|
||||
<option name="name" value="iSharkFly Private Releases" />
|
||||
<option name="url" value="https://repo.isharkfly.com/repository/isharkfly-maven-releases/" />
|
||||
</remote-repository>
|
||||
<remote-repository>
|
||||
<option name="id" value="central" />
|
||||
<option name="name" value="Central Repository" />
|
||||
<option name="url" value="https://repo.isharkfly.com/repository/maven/" />
|
||||
</remote-repository>
|
||||
<remote-repository>
|
||||
<option name="id" value="ossez-repo-snapshots" />
|
||||
<option name="name" value="iSharkFly Private Snapshots" />
|
||||
<option name="url" value="https://repo.isharkfly.com/repository/isharkfly-maven-snapshots/" />
|
||||
</remote-repository>
|
||||
<remote-repository>
|
||||
<option name="id" value="central" />
|
||||
<option name="name" value="Maven Central repository" />
|
||||
<option name="url" value="https://repo1.maven.org/maven2" />
|
||||
</remote-repository>
|
||||
<remote-repository>
|
||||
<option name="id" value="jboss.community" />
|
||||
<option name="name" value="JBoss Community repository" />
|
||||
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
|
||||
</remote-repository>
|
||||
</component>
|
||||
</project>
|
||||
Generated
+6
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
@@ -1,6 +1,6 @@
|
||||
HtmlCleaner is a project originally developed by Vladimir Nikic (http://htmlcleaner.sourceforge.net/).
|
||||
|
||||
This version is modified by Zheng Sun.
|
||||
This version is modified by iSharkFly.
|
||||
|
||||
Briefly speaking, the modifications are
|
||||
|
||||
|
||||
@@ -0,0 +1,44 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
|
||||
Additional work by Amplafi. -- All rights released.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
 * A single attribute transformation: a condition evaluated against an
 * attribute's name and value, paired with a template string.
 */
public interface AttributeTransformation {

    /**
     * @return the template string carried by this transformation
     */
    String getTemplate();

    /**
     * Evaluates this transformation's condition for one attribute.
     *
     * @param attName  name of the attribute being examined
     * @param attValue value of the attribute being examined
     * @return {@code true} if the attribute meets the condition
     */
    boolean satisfy(String attName, String attValue);
}
|
||||
@@ -0,0 +1,72 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
|
||||
Additional work by Amplafi. -- All rights released.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class AttributeTransformationPatternImpl implements AttributeTransformation {
|
||||
private final Pattern attNamePattern;
|
||||
private final Pattern attValuePattern;
|
||||
private final String template;
|
||||
public AttributeTransformationPatternImpl(Pattern attNamePattern, Pattern attValuePattern, String template) {
|
||||
this.attNamePattern = attNamePattern;
|
||||
this.attValuePattern = attValuePattern;
|
||||
this.template = template;
|
||||
}
|
||||
public AttributeTransformationPatternImpl(String attNamePattern, String attValuePattern, String template) {
|
||||
this.attNamePattern = attNamePattern ==null?null:Pattern.compile(attNamePattern);
|
||||
this.attValuePattern = attValuePattern == null? null: Pattern.compile(attValuePattern);
|
||||
this.template = template;
|
||||
}
|
||||
|
||||
public boolean satisfy(String attName, String attValue) {
|
||||
if ( (attNamePattern == null || attNamePattern.matcher(attName).find()) && (attValuePattern ==null || attValuePattern.matcher(attValue).find())){
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the template
|
||||
*/
|
||||
public String getTemplate() {
|
||||
return template;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class BaseHtmlNode extends BaseTokenImpl implements HtmlNode {
|
||||
|
||||
protected TagNode parent;
|
||||
|
||||
public List<? extends BaseToken> getSiblings(){
|
||||
//
|
||||
// If this is a root node, return an empty list
|
||||
//
|
||||
if (this.parent == null) { return new ArrayList<BaseToken>(); };
|
||||
//
|
||||
// Otherwise, return all the children, including this node
|
||||
//
|
||||
return this.parent.getAllChildren();
|
||||
}
|
||||
|
||||
public TagNode getParent() {
|
||||
return parent;
|
||||
}
|
||||
|
||||
public void setParent(TagNode parent) {
|
||||
this.parent = parent;
|
||||
}
|
||||
|
||||
public void serialize(Serializer serializer, Writer writer)
|
||||
throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Base token interface. Tokens are individual entities recognized by HTML parser.
|
||||
* </p>
|
||||
*/
|
||||
public interface BaseToken {
|
||||
|
||||
public void serialize(Serializer serializer, Writer writer) throws IOException;
|
||||
|
||||
/**
|
||||
* @return row in source html where the token was found
|
||||
*/
|
||||
public int getRow();
|
||||
|
||||
/**
|
||||
* @param row
|
||||
*/
|
||||
public void setRow(int row);
|
||||
|
||||
/**
|
||||
* @return col in source html where the token was found
|
||||
*/
|
||||
public int getCol();
|
||||
|
||||
/**
|
||||
* @param col
|
||||
*/
|
||||
public void setCol(int col);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
|
||||
* Base class for all tokens. Allows position tracking.
|
||||
*
|
||||
* @author Konstantin Burov (aectann@gmail.com)
|
||||
*
|
||||
*/
|
||||
public abstract class BaseTokenImpl implements BaseToken {
|
||||
|
||||
private int row;
|
||||
private int col;
|
||||
|
||||
protected BaseTokenImpl(){
|
||||
|
||||
}
|
||||
|
||||
protected BaseTokenImpl(int row, int col) {
|
||||
this.row = row;
|
||||
this.col = col;
|
||||
}
|
||||
public int getRow() {
|
||||
return row;
|
||||
}
|
||||
public void setRow(int row) {
|
||||
this.row = row;
|
||||
}
|
||||
public int getCol() {
|
||||
return col;
|
||||
}
|
||||
public void setCol(int col) {
|
||||
this.col = col;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "(line="+getRow()+", col="+getCol()+")";
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
 * Enumerates the values {@code HEAD_AND_BODY}, {@code HEAD} and {@code BODY},
 * each paired with a short string code.
 *
 * @author patmoore
 *
 */
public enum BelongsTo {

    HEAD_AND_BODY("all"),
    HEAD("head"),
    BODY("body");

    private final String dbCode;

    BelongsTo(String dbCode) {
        this.dbCode = dbCode;
    }

    /**
     * @return the dbCode
     */
    public String getDbCode() {
        return dbCode;
    }

    /**
     * Converts an arbitrary object to a {@code BelongsTo} constant.
     *
     * @param value a {@code BelongsTo}, or any object whose trimmed
     *              {@code toString()} equals (ignoring case) either a
     *              constant's code or its name; may be {@code null}
     * @return the matching constant, or {@code null} when {@code value} is
     *         {@code null} or matches nothing
     */
    public static BelongsTo toValue(Object value) {
        if (value == null) {
            return null;
        }
        if (value instanceof BelongsTo) {
            return (BelongsTo) value;
        }
        final String wanted = value.toString().trim();
        for (BelongsTo candidate : BelongsTo.values()) {
            final boolean codeMatches = candidate.getDbCode().equalsIgnoreCase(wanted);
            if (codeMatches || candidate.name().equalsIgnoreCase(wanted)) {
                return candidate;
            }
        }
        return null;
    }
}
|
||||
@@ -0,0 +1,152 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Browser compact XML serializer - creates resulting XML by stripping whitespaces wherever possible,
|
||||
* but preserving single whitespace where at least one exists. This behaviour is well suited
|
||||
* for web-browsers, which usually treat multiple whitespaces as single one, but make difference
|
||||
* between single whitespace and empty text.
|
||||
* </p>
|
||||
*/
|
||||
public class BrowserCompactXmlSerializer extends XmlSerializer {
|
||||
|
||||
private static final String PRE_TAG = "pre";
|
||||
private static final String BR_TAG = "<br />";
|
||||
private static final String LINE_BREAK = "\n";
|
||||
|
||||
public BrowserCompactXmlSerializer(CleanerProperties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
|
||||
serializeOpenTag(tagNode, writer, false);
|
||||
TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
|
||||
String tagName = tagInfo!=null? tagInfo.getName() : null;
|
||||
List<? extends BaseToken> tagChildren = new ArrayList<BaseToken>(tagNode.getAllChildren());
|
||||
if (!isMinimizedTagSyntax(tagNode)) {
|
||||
ListIterator<? extends BaseToken> childrenIt = tagChildren.listIterator();
|
||||
while (childrenIt.hasNext()) {
|
||||
Object item = childrenIt.next();
|
||||
if (item != null) {
|
||||
if (item instanceof ContentNode && !PRE_TAG.equals(tagName)) {
|
||||
String content = ((ContentNode) item).getContent();
|
||||
content = dontEscape(tagNode) ? content.replaceAll("]]>", "]]>") : escapeXml(content);
|
||||
content = content.replaceAll("^"+SpecialEntities.NON_BREAKABLE_SPACE+"+", " ");
|
||||
content = content.replaceAll(SpecialEntities.NON_BREAKABLE_SPACE+"+$", " ");
|
||||
boolean whitespaceAllowed = tagInfo != null && tagInfo.getDisplay().isLeadingAndEndWhitespacesAllowed();
|
||||
boolean writeLeadingSpace = content.length() > 0 && (Character.isWhitespace(content.charAt(0)));
|
||||
boolean writeEndingSpace = content.length() > 1 && Character.isWhitespace(content.charAt(content.length() - 1));
|
||||
content = content.trim();
|
||||
if (content.length() != 0) {
|
||||
boolean hasPrevContent = false;
|
||||
int order = tagChildren.indexOf(item);
|
||||
if (order >= 2) {
|
||||
Object prev = tagChildren.get(order-1);
|
||||
hasPrevContent = isContentOrInline(prev);
|
||||
}
|
||||
|
||||
if (writeLeadingSpace && (whitespaceAllowed || hasPrevContent)) {
|
||||
writer.write(' ');
|
||||
}
|
||||
|
||||
StringTokenizer tokenizer = new StringTokenizer(content, LINE_BREAK, true);
|
||||
String prevToken = "";
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String token = tokenizer.nextToken();
|
||||
if (prevToken.equals(token) && prevToken.equals(LINE_BREAK)) {
|
||||
writer.write(BR_TAG);
|
||||
prevToken = "";
|
||||
} else if (LINE_BREAK.equals(token)) {
|
||||
writer.write(' ');
|
||||
} else {
|
||||
writer.write(token.trim());
|
||||
}
|
||||
prevToken = token;
|
||||
}
|
||||
|
||||
boolean hasFollowingContent = false;
|
||||
if (childrenIt.hasNext()) {
|
||||
Object next = childrenIt.next();
|
||||
hasFollowingContent = isContentOrInline(next);
|
||||
childrenIt.previous();
|
||||
}
|
||||
|
||||
if (writeEndingSpace && (whitespaceAllowed || hasFollowingContent)) {
|
||||
writer.write(' ');
|
||||
}
|
||||
} else{
|
||||
childrenIt.remove();
|
||||
}
|
||||
} else if(item instanceof ContentNode){
|
||||
String content = ((ContentNode) item).getContent();
|
||||
writer.write(content);
|
||||
} else if (item instanceof CommentNode) {
|
||||
String content = ((CommentNode) item).getCommentedContent().trim();
|
||||
writer.write(content);
|
||||
} else {
|
||||
((BaseToken)item).serialize(this, writer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
serializeEndTag(tagNode, writer, tagInfo != null && tagInfo.getDisplay().isAfterTagLineBreakNeeded());
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isContentOrInline(Object node) {
|
||||
boolean result = false;
|
||||
if (node instanceof ContentNode) {
|
||||
result = true;
|
||||
} else if (node instanceof TagNode) {
|
||||
TagInfo nextInfo = props.getTagInfoProvider().getTagInfo(((TagNode) node).getName());
|
||||
result = nextInfo != null && nextInfo.getDisplay() == Display.inline;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
/* Copyright (c) 2006-2013, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
public class CData extends ContentNode implements HtmlNode {
|
||||
|
||||
public static final String BEGIN_CDATA = "<![CDATA[";
|
||||
public static final String END_CDATA = "]]>";
|
||||
public static final String SAFE_BEGIN_CDATA = "/*<![CDATA[*/";
|
||||
public static final String SAFE_END_CDATA = "/*]]>*/";
|
||||
public static final String SAFE_BEGIN_CDATA_ALT = "//<![CDATA[";
|
||||
public static final String SAFE_END_CDATA_ALT = "//]]>";
|
||||
|
||||
public CData(String content){
|
||||
super(content);
|
||||
}
|
||||
|
||||
public String getContentWithoutStartAndEndTokens(){
|
||||
return this.content;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.htmlcleaner.ContentNode#getContent()
|
||||
*/
|
||||
@Override
|
||||
public String getContent() {
|
||||
return getContentWithoutStartAndEndTokens();
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.htmlcleaner.ContentNode#toString()
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return getContentWithStartAndEndTokens();
|
||||
}
|
||||
|
||||
public String getContentWithStartAndEndTokens(){
|
||||
return SAFE_BEGIN_CDATA + this.content + SAFE_END_CDATA;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
@@ -0,0 +1,54 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.Stack;
|
||||
|
||||
/**
|
||||
* Contains information about nodes that were closed due to their child nodes.
|
||||
* i.e. if 'p' tag was closed due to 'table' child tag.
|
||||
*
|
||||
* @author Konstantin Burov
|
||||
*
|
||||
*/
|
||||
class ChildBreaks{
|
||||
Stack < TagPos> closedByChildBreak = new Stack < TagPos >();
|
||||
private Stack < TagPos > breakingTags = new Stack < TagPos >();
|
||||
|
||||
/**
|
||||
* Adds the break info to the top of the stacks.
|
||||
*
|
||||
* @param closedPos - position of the tag that was closed due to incorrect child
|
||||
* @param breakPos - position of the child that has broken its parent
|
||||
*/
|
||||
public void addBreak(TagPos closedPos, TagPos breakPos){
|
||||
closedByChildBreak.add(closedPos);
|
||||
breakingTags.add(breakPos);
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return closedByChildBreak.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return name of the last children tag that has broken its parent.
|
||||
*/
|
||||
public String getLastBreakingTag() {
|
||||
return breakingTags.peek().name;
|
||||
}
|
||||
|
||||
/**
|
||||
* pops out latest broken tag position.
|
||||
*
|
||||
* @return tag pos of the last parent that was broken.
|
||||
*/
|
||||
public TagPos pop() {
|
||||
breakingTags.pop();
|
||||
return closedByChildBreak.pop();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return position of the last tag that has broken its parent. -1 if no such tag found.
|
||||
*/
|
||||
public int getLastBreakingTagPosition() {
|
||||
return breakingTags.isEmpty()?-1:breakingTags.peek().position;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
/* Copyright (c) 2006-2013, HtmlCleaner Team (Vladimir Nikic, Pat Moore, Scott Wilson)
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.Set;
|
||||
import java.util.Stack;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.htmlcleaner.conditional.ITagNodeCondition;
|
||||
|
||||
/**
|
||||
* This class is for thread-safe handling of private instance variables from HtmlCleaner
|
||||
*/
|
||||
class CleanTimeValues {
|
||||
|
||||
boolean _headOpened = false;
|
||||
boolean _bodyOpened = false;
|
||||
@SuppressWarnings("rawtypes")
|
||||
Set _headTags = new LinkedHashSet();
|
||||
@SuppressWarnings("rawtypes")
|
||||
Set allTags = new TreeSet();
|
||||
transient Stack<NestingState> nestingStates = new Stack<NestingState>();
|
||||
|
||||
TagNode htmlNode;
|
||||
TagNode bodyNode;
|
||||
TagNode headNode;
|
||||
TagNode rootNode;
|
||||
|
||||
Set<ITagNodeCondition> pruneTagSet = new HashSet<ITagNodeCondition>();
|
||||
Set<TagNode> pruneNodeSet = new HashSet<TagNode>();
|
||||
Set<ITagNodeCondition> allowTagSet;
|
||||
|
||||
/**
|
||||
* A stack of namespaces for currently open tags. Every xmlns declaration
|
||||
* on a tag adds another namespace to the stack, which is removed when the
|
||||
* tag is closed. In this way you can keep track of what namespace a tag
|
||||
* belongs to.
|
||||
*/
|
||||
transient Stack<String> namespace = new Stack<String>();
|
||||
|
||||
/**
|
||||
* A map of all the namespace prefixes and URIs declared within the document.
|
||||
* We use this to check whether any prefixes remain undeclared.
|
||||
*/
|
||||
transient HashMap<String, String> namespaceMap = new HashMap<String, String>();
|
||||
}
|
||||
@@ -0,0 +1,665 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import org.htmlcleaner.audit.ErrorType;
|
||||
import org.htmlcleaner.audit.HtmlModificationListener;
|
||||
import org.htmlcleaner.conditional.ITagNodeCondition;
|
||||
import org.htmlcleaner.conditional.TagNodeAutoGeneratedCondition;
|
||||
import org.htmlcleaner.conditional.TagNodeNameCondition;
|
||||
|
||||
/**
|
||||
* Properties defining cleaner's behaviour
|
||||
*/
|
||||
public class CleanerProperties implements HtmlModificationListener{
|
||||
// Force consistent cross-platform encoding ( mandatory for reliable server operation)
|
||||
public static final String DEFAULT_CHARSET = "UTF-8";
|
||||
public static final String BOOL_ATT_SELF = "self";
|
||||
public static final String BOOL_ATT_EMPTY = "empty";
|
||||
public static final String BOOL_ATT_TRUE = "true";
|
||||
|
||||
private ITagInfoProvider tagInfoProvider;
|
||||
/**
|
||||
* If this parameter is set to true, ampersand sign (&) that proceeds valid XML character sequences (&XXX;) will not be escaped with &XXX;
|
||||
*/
|
||||
private boolean advancedXmlEscape;
|
||||
private String useCdataFor;
|
||||
private List<String> useCdataForList;
|
||||
private boolean translateSpecialEntities;
|
||||
private boolean recognizeUnicodeChars;
|
||||
private boolean omitUnknownTags;
|
||||
private boolean treatUnknownTagsAsContent;
|
||||
private boolean omitDeprecatedTags;
|
||||
private boolean omitComments;
|
||||
private boolean treatDeprecatedTagsAsContent;
|
||||
private OptionalOutput omitXmlDeclaration;
|
||||
private OptionalOutput omitDoctypeDeclaration;
|
||||
private OptionalOutput omitHtmlEnvelope;
|
||||
private boolean useEmptyElementTags;
|
||||
private boolean allowMultiWordAttributes;
|
||||
private String booleanAttributeValues;
|
||||
private boolean ignoreQuestAndExclam;
|
||||
private boolean allowHtmlInsideAttributes;
|
||||
private boolean namespacesAware;
|
||||
private boolean transSpecialEntitiesToNCR;
|
||||
private boolean omitCdataOutsideScriptAndStyle;
|
||||
private boolean deserializeEntities;
|
||||
private boolean trimAttributeValues;
|
||||
private int htmlVersion;
|
||||
|
||||
private boolean allowInvalidAttributeNames;
|
||||
private String invalidAttributeNamePrefix;
|
||||
|
||||
/**
|
||||
* Provides an arbitrary recursion depth
|
||||
*/
|
||||
private int maxDepth;
|
||||
public int getMaxDepth() {
|
||||
return maxDepth;
|
||||
}
|
||||
public void setMaxDepth(int maxDepth) {
|
||||
this.maxDepth = maxDepth;
|
||||
}
|
||||
|
||||
/**
|
||||
* "cause the cleaner cannot keep track of whitespace at that level",
|
||||
* there are 2 lists built: one for the head , one for the body. So whitespace that falls outside of the head and body is not preserved
|
||||
* this creates at least a newline break.
|
||||
*
|
||||
* More work than really wanted at this point to "preserve" the whitespace.
|
||||
*/
|
||||
private boolean addNewlineToHeadAndBody;
|
||||
/**
|
||||
* Tries to keep inside head all whitespace and comments that were originally there
|
||||
*/
|
||||
private boolean keepWhitespaceAndCommentsInHead;
|
||||
private String hyphenReplacementInComment;
|
||||
// comma separate list of tags pruned.
|
||||
private String pruneTags;
|
||||
// comma separate list of tags allowed.
|
||||
private String allowTags;
|
||||
|
||||
private CleanerTransformations cleanerTransformations = new CleanerTransformations();
|
||||
|
||||
private List < HtmlModificationListener > htmlModificationListeners;
|
||||
|
||||
/**
|
||||
* blacklist of tags
|
||||
*/
|
||||
private Set<ITagNodeCondition> pruneTagSet = new HashSet<ITagNodeCondition>();
|
||||
/**
|
||||
* the list of allowed tags (whitelist approach v. blacklist approach of pruneTags )
|
||||
*/
|
||||
private Set<ITagNodeCondition> allowTagSet = new HashSet<ITagNodeCondition>();
|
||||
private String charset = DEFAULT_CHARSET;
|
||||
private boolean transResCharsToNCR;
|
||||
|
||||
public CleanerProperties() {
|
||||
reset();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tagInfoProvider
|
||||
*/
|
||||
public CleanerProperties(ITagInfoProvider tagInfoProvider) {
|
||||
reset();
|
||||
this.tagInfoProvider = tagInfoProvider;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tagInfoProvider the tagInfoProvider to set
|
||||
*/
|
||||
void setTagInfoProvider(ITagInfoProvider tagInfoProvider) {
|
||||
this.tagInfoProvider = tagInfoProvider;
|
||||
}
|
||||
|
||||
public ITagInfoProvider getTagInfoProvider() {
|
||||
return tagInfoProvider;
|
||||
}
|
||||
|
||||
public boolean isAdvancedXmlEscape() {
|
||||
return advancedXmlEscape;
|
||||
}
|
||||
|
||||
public void setAdvancedXmlEscape(boolean advancedXmlEscape) {
|
||||
this.advancedXmlEscape = advancedXmlEscape;
|
||||
}
|
||||
|
||||
public boolean isTransResCharsToNCR() {
|
||||
return transResCharsToNCR;
|
||||
}
|
||||
|
||||
public void setTransResCharsToNCR(boolean transResCharsToNCR) {
|
||||
this.transResCharsToNCR = transResCharsToNCR;
|
||||
}
|
||||
|
||||
public boolean isUseCdataForScriptAndStyle() {
|
||||
return isUseCdataFor("script") && isUseCdataFor("style");
|
||||
}
|
||||
|
||||
public void setUseCdataForScriptAndStyle(boolean useCdataForScriptAndStyle) {
|
||||
if (useCdataForScriptAndStyle)
|
||||
setUseCdataFor("script,style");
|
||||
else
|
||||
setUseCdataFor("");
|
||||
}
|
||||
|
||||
public void setUseCdataFor(String useCdataFor) {
|
||||
if (useCdataFor != null) {
|
||||
this.useCdataFor = useCdataFor;
|
||||
this.useCdataForList = Arrays.asList(useCdataFor.toLowerCase().split(","));
|
||||
} else {
|
||||
this.useCdataFor = "";
|
||||
this.useCdataForList = null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public String getUseCdataFor() {
|
||||
return this.useCdataFor;
|
||||
}
|
||||
|
||||
public boolean isUseCdataFor(String useCdataFor) {
|
||||
if (useCdataForList != null && useCdataFor != null)
|
||||
return useCdataForList.contains(useCdataFor.toLowerCase());
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isTranslateSpecialEntities() {
|
||||
return translateSpecialEntities;
|
||||
}
|
||||
|
||||
/**
|
||||
* TODO : use {@link OptionalOutput}
|
||||
* @param translateSpecialEntities
|
||||
*/
|
||||
public void setTranslateSpecialEntities(boolean translateSpecialEntities) {
|
||||
this.translateSpecialEntities = translateSpecialEntities;
|
||||
}
|
||||
|
||||
public boolean isRecognizeUnicodeChars() {
|
||||
return recognizeUnicodeChars;
|
||||
}
|
||||
|
||||
public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
|
||||
this.recognizeUnicodeChars = recognizeUnicodeChars;
|
||||
}
|
||||
|
||||
public boolean isOmitUnknownTags() {
|
||||
return omitUnknownTags;
|
||||
}
|
||||
|
||||
public void setOmitUnknownTags(boolean omitUnknownTags) {
|
||||
this.omitUnknownTags = omitUnknownTags;
|
||||
}
|
||||
|
||||
public boolean isTreatUnknownTagsAsContent() {
|
||||
return treatUnknownTagsAsContent;
|
||||
}
|
||||
|
||||
public void setTreatUnknownTagsAsContent(boolean treatUnknownTagsAsContent) {
|
||||
this.treatUnknownTagsAsContent = treatUnknownTagsAsContent;
|
||||
}
|
||||
|
||||
public boolean isOmitDeprecatedTags() {
|
||||
return omitDeprecatedTags;
|
||||
}
|
||||
|
||||
public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
|
||||
this.omitDeprecatedTags = omitDeprecatedTags;
|
||||
}
|
||||
|
||||
public boolean isTreatDeprecatedTagsAsContent() {
|
||||
return treatDeprecatedTagsAsContent;
|
||||
}
|
||||
|
||||
public void setTreatDeprecatedTagsAsContent(boolean treatDeprecatedTagsAsContent) {
|
||||
this.treatDeprecatedTagsAsContent = treatDeprecatedTagsAsContent;
|
||||
}
|
||||
|
||||
public boolean isOmitComments() {
|
||||
return omitComments;
|
||||
}
|
||||
|
||||
public void setOmitComments(boolean omitComments) {
|
||||
this.omitComments = omitComments;
|
||||
}
|
||||
|
||||
public boolean isOmitXmlDeclaration() {
|
||||
return omitXmlDeclaration == OptionalOutput.omit;
|
||||
}
|
||||
|
||||
public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
|
||||
this.omitXmlDeclaration = omitXmlDeclaration?OptionalOutput.omit:OptionalOutput.alwaysOutput;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return also return true if omitting the Html Envelope
|
||||
*/
|
||||
public boolean isOmitDoctypeDeclaration() {
|
||||
return omitDoctypeDeclaration == OptionalOutput.omit || isOmitHtmlEnvelope();
|
||||
}
|
||||
|
||||
public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
|
||||
this.omitDoctypeDeclaration = omitDoctypeDeclaration?OptionalOutput.omit:OptionalOutput.alwaysOutput;
|
||||
}
|
||||
|
||||
public boolean isOmitHtmlEnvelope() {
|
||||
return omitHtmlEnvelope == OptionalOutput.omit;
|
||||
}
|
||||
|
||||
public void setOmitHtmlEnvelope(boolean omitHtmlEnvelope) {
|
||||
this.omitHtmlEnvelope = omitHtmlEnvelope?OptionalOutput.omit:OptionalOutput.alwaysOutput;
|
||||
}
|
||||
|
||||
public boolean isUseEmptyElementTags() {
|
||||
return useEmptyElementTags;
|
||||
}
|
||||
|
||||
public void setUseEmptyElementTags(boolean useEmptyElementTags) {
|
||||
this.useEmptyElementTags = useEmptyElementTags;
|
||||
}
|
||||
|
||||
public boolean isAllowMultiWordAttributes() {
|
||||
return allowMultiWordAttributes;
|
||||
}
|
||||
|
||||
public void setAllowMultiWordAttributes(boolean allowMultiWordAttributes) {
|
||||
this.allowMultiWordAttributes = allowMultiWordAttributes;
|
||||
}
|
||||
|
||||
public boolean isAllowHtmlInsideAttributes() {
|
||||
return allowHtmlInsideAttributes;
|
||||
}
|
||||
|
||||
public void setAllowHtmlInsideAttributes(boolean allowHtmlInsideAttributes) {
|
||||
this.allowHtmlInsideAttributes = allowHtmlInsideAttributes;
|
||||
}
|
||||
|
||||
public boolean isIgnoreQuestAndExclam() {
|
||||
return ignoreQuestAndExclam;
|
||||
}
|
||||
|
||||
public void setIgnoreQuestAndExclam(boolean ignoreQuestAndExclam) {
|
||||
this.ignoreQuestAndExclam = ignoreQuestAndExclam;
|
||||
}
|
||||
|
||||
public boolean isNamespacesAware() {
|
||||
return namespacesAware;
|
||||
}
|
||||
|
||||
public void setNamespacesAware(boolean namespacesAware) {
|
||||
this.namespacesAware = namespacesAware;
|
||||
}
|
||||
|
||||
public boolean isAddNewlineToHeadAndBody() {
|
||||
return addNewlineToHeadAndBody;
|
||||
}
|
||||
|
||||
public void setAddNewlineToHeadAndBody(boolean addNewlineToHeadAndBody) {
|
||||
this.addNewlineToHeadAndBody = addNewlineToHeadAndBody;
|
||||
}
|
||||
|
||||
public boolean isKeepWhitespaceAndCommentsInHead() {
|
||||
return keepWhitespaceAndCommentsInHead;
|
||||
}
|
||||
|
||||
public void setKeepWhitespaceAndCommentsInHead(boolean keepHeadWhitespace) {
|
||||
this.keepWhitespaceAndCommentsInHead = keepHeadWhitespace;
|
||||
}
|
||||
|
||||
public String getHyphenReplacementInComment() {
|
||||
return hyphenReplacementInComment;
|
||||
}
|
||||
|
||||
public void setHyphenReplacementInComment(String hyphenReplacementInComment) {
|
||||
this.hyphenReplacementInComment = hyphenReplacementInComment;
|
||||
}
|
||||
|
||||
public String getPruneTags() {
|
||||
return pruneTags;
|
||||
}
|
||||
|
||||
public boolean isOmitCdataOutsideScriptAndStyle(){
|
||||
return omitCdataOutsideScriptAndStyle;
|
||||
}
|
||||
public void setOmitCdataOutsideScriptAndStyle(boolean value){
|
||||
omitCdataOutsideScriptAndStyle = value;
|
||||
}
|
||||
|
||||
public boolean isDeserializeEntities() {
|
||||
return deserializeEntities;
|
||||
}
|
||||
|
||||
public void setDeserializeEntities(boolean deserializeEntities) {
|
||||
this.deserializeEntities = deserializeEntities;
|
||||
}
|
||||
/**
|
||||
* Sets the html version according to the parameter.Also,it sets the
|
||||
* tag provider to the appropriate version.
|
||||
*
|
||||
* @param version Number 4 for html4 or 5 for html5
|
||||
*/
|
||||
public void setHtmlVersion(int version){
|
||||
this.htmlVersion=version;
|
||||
if (version==4)
|
||||
this.setTagInfoProvider(Html4TagProvider.INSTANCE);
|
||||
else
|
||||
this.setTagInfoProvider(Html5TagProvider.INSTANCE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the html version
|
||||
* @return int The html version
|
||||
*/
|
||||
public int getHtmlVersion (){
|
||||
return this.htmlVersion;
|
||||
}
|
||||
|
||||
public boolean isTrimAttributeValues() {
|
||||
return trimAttributeValues;
|
||||
}
|
||||
|
||||
public void setTrimAttributeValues(boolean trimAttributeValues) {
|
||||
this.trimAttributeValues = trimAttributeValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets prune tags set and adds tag name conditions to it.
|
||||
* All the tags listed by pruneTags param are added.
|
||||
*
|
||||
* @param pruneTags
|
||||
*/
|
||||
public void setPruneTags(String pruneTags) {
|
||||
this.pruneTags = pruneTags;
|
||||
this.resetPruneTagSet();
|
||||
this.addTagNameConditions(this.pruneTagSet, pruneTags);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the condition to existing prune tag set.
|
||||
*
|
||||
* @param condition
|
||||
*/
|
||||
public void addPruneTagNodeCondition(ITagNodeCondition condition){
|
||||
pruneTagSet.add(condition);
|
||||
}
|
||||
|
||||
public Set<ITagNodeCondition> getPruneTagSet() {
|
||||
return pruneTagSet;
|
||||
}
|
||||
|
||||
public String getAllowTags() {
|
||||
return allowTags;
|
||||
}
|
||||
|
||||
public void setAllowTags(String allowTags) {
|
||||
this.allowTags = allowTags;
|
||||
this.setAllowTagSet(allowTags);
|
||||
}
|
||||
|
||||
private void setAllowTagSet(String allowTags) {
|
||||
allowTagSet.clear();
|
||||
addTagNameConditions(allowTagSet, allowTags);
|
||||
}
|
||||
|
||||
|
||||
public boolean isTransSpecialEntitiesToNCR() {
|
||||
return transSpecialEntitiesToNCR;
|
||||
}
|
||||
|
||||
public void setTransSpecialEntitiesToNCR(boolean transSpecialEntitiesToNCR) {
|
||||
this.transSpecialEntitiesToNCR = transSpecialEntitiesToNCR;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tagSet
|
||||
* @param tagsNameStr
|
||||
*/
|
||||
private void addTagNameConditions(Set<ITagNodeCondition> tagSet, String tagsNameStr) {
|
||||
if (tagsNameStr != null) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(tagsNameStr, ",");
|
||||
while ( tokenizer.hasMoreTokens() ) {
|
||||
tagSet.add( new TagNodeNameCondition(tokenizer.nextToken().trim().toLowerCase()) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Set<ITagNodeCondition> getAllowTagSet() {
|
||||
return allowTagSet;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param charset the charset to set
|
||||
*/
|
||||
public void setCharset(String charset) {
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the charset
|
||||
*/
|
||||
public String getCharset() {
|
||||
return charset;
|
||||
}
|
||||
|
||||
public String getBooleanAttributeValues() {
|
||||
return booleanAttributeValues;
|
||||
}
|
||||
|
||||
public void setBooleanAttributeValues(String booleanAttributeValues) {
|
||||
if ( BOOL_ATT_SELF.equalsIgnoreCase(booleanAttributeValues) ||
|
||||
BOOL_ATT_EMPTY.equalsIgnoreCase(booleanAttributeValues) ||
|
||||
BOOL_ATT_TRUE.equalsIgnoreCase(booleanAttributeValues) ) {
|
||||
this.booleanAttributeValues = booleanAttributeValues.toLowerCase();
|
||||
} else {
|
||||
this.booleanAttributeValues = BOOL_ATT_SELF;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* advancedXmlEscape = true;
|
||||
* setUseCdataFor("script,style");
|
||||
* translateSpecialEntities = true;
|
||||
* recognizeUnicodeChars = true;
|
||||
* omitUnknownTags = false;
|
||||
* treatUnknownTagsAsContent = false;
|
||||
* omitDeprecatedTags = false;
|
||||
* treatDeprecatedTagsAsContent = false;
|
||||
* omitComments = false;
|
||||
* omitXmlDeclaration = OptionalOutput.alwaysOutput;
|
||||
* omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
|
||||
* omitHtmlEnvelope = OptionalOutput.alwaysOutput;
|
||||
* useEmptyElementTags = true;
|
||||
* allowMultiWordAttributes = true;
|
||||
* allowHtmlInsideAttributes = false;
|
||||
* ignoreQuestAndExclam = true;
|
||||
* namespacesAware = true;
|
||||
* keepHeadWhitespace = true;
|
||||
* addNewlineToHeadAndBody = true;
|
||||
* hyphenReplacementInComment = "=";
|
||||
* pruneTags = null;
|
||||
* allowTags = null;
|
||||
* booleanAttributeValues = BOOL_ATT_SELF;
|
||||
* collapseNullHtml = CollapseHtml.none
|
||||
* charset = "UTF-8";
|
||||
* trimAttributeValues = true;
|
||||
* tagInfoProvider = HTML5TagProvider.INSTANCE
|
||||
* maxDepth = 1000
|
||||
*/
|
||||
public void reset() {
|
||||
advancedXmlEscape = true;
|
||||
setUseCdataFor("script,style");
|
||||
translateSpecialEntities = true;
|
||||
recognizeUnicodeChars = true;
|
||||
omitUnknownTags = false;
|
||||
treatUnknownTagsAsContent = false;
|
||||
omitDeprecatedTags = false;
|
||||
treatDeprecatedTagsAsContent = false;
|
||||
omitComments = false;
|
||||
omitXmlDeclaration = OptionalOutput.alwaysOutput;
|
||||
omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
|
||||
omitHtmlEnvelope = OptionalOutput.alwaysOutput;
|
||||
useEmptyElementTags = true;
|
||||
allowMultiWordAttributes = true;
|
||||
allowHtmlInsideAttributes = false;
|
||||
ignoreQuestAndExclam = true;
|
||||
namespacesAware = true;
|
||||
addNewlineToHeadAndBody = true;
|
||||
keepWhitespaceAndCommentsInHead = true;
|
||||
hyphenReplacementInComment = "=";
|
||||
setPruneTags(null);
|
||||
setAllowTags(null);
|
||||
booleanAttributeValues = BOOL_ATT_SELF;
|
||||
charset = "UTF-8";
|
||||
cleanerTransformations.clear();
|
||||
resetPruneTagSet();
|
||||
if (this.getHtmlVersion()==HtmlCleaner.HTML_4){
|
||||
tagInfoProvider = Html4TagProvider.INSTANCE;
|
||||
}
|
||||
else{
|
||||
tagInfoProvider = Html5TagProvider.INSTANCE;
|
||||
}
|
||||
htmlModificationListeners = new ArrayList < HtmlModificationListener >();
|
||||
omitCdataOutsideScriptAndStyle = false;
|
||||
trimAttributeValues = true;
|
||||
invalidAttributeNamePrefix = "";
|
||||
allowInvalidAttributeNames = false;
|
||||
maxDepth = 1000;
|
||||
}
|
||||
|
||||
private void resetPruneTagSet() {
|
||||
pruneTagSet.clear();
|
||||
pruneTagSet.add(TagNodeAutoGeneratedCondition.INSTANCE);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the cleanerTransformations
|
||||
*/
|
||||
public CleanerTransformations getCleanerTransformations() {
|
||||
return cleanerTransformations;
|
||||
}
|
||||
|
||||
public void setCleanerTransformations(CleanerTransformations cleanerTransformations) {
|
||||
if ( cleanerTransformations == null ) {
|
||||
this.cleanerTransformations.clear();
|
||||
} else {
|
||||
this.cleanerTransformations = cleanerTransformations;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a listener to the list of objects that will be notified about changes that
|
||||
* cleaner does during cleanup process.
|
||||
*
|
||||
* @param listener -- listener object to be notified of the changes.
|
||||
*/
|
||||
public void addHtmlModificationListener(HtmlModificationListener listener){
|
||||
htmlModificationListeners.add(listener);
|
||||
}
|
||||
|
||||
public void fireConditionModification(ITagNodeCondition condition, TagNode tagNode) {
|
||||
for (HtmlModificationListener listener : htmlModificationListeners) {
|
||||
listener.fireConditionModification(condition, tagNode);
|
||||
}
|
||||
}
|
||||
|
||||
public void fireHtmlError(boolean certainty, TagNode startTagToken, ErrorType type) {
|
||||
for (HtmlModificationListener listener : htmlModificationListeners) {
|
||||
listener.fireHtmlError(certainty, startTagToken, type);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void fireUglyHtml(boolean certainty, TagNode startTagToken, ErrorType errorType) {
|
||||
for (HtmlModificationListener listener : htmlModificationListeners) {
|
||||
listener.fireUglyHtml(certainty, startTagToken, errorType);
|
||||
}
|
||||
}
|
||||
|
||||
public void fireUserDefinedModification(boolean certainty, TagNode tagNode, ErrorType errorType) {
|
||||
for (HtmlModificationListener listener : htmlModificationListeners) {
|
||||
listener.fireUserDefinedModification(certainty, tagNode, errorType);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the prefix to use to try to make valid attribute names
|
||||
* @return invalidAttributeNamePrefix
|
||||
*/
|
||||
public String getInvalidXmlAttributeNamePrefix() {
|
||||
return invalidAttributeNamePrefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the prefix to use for xml attributes that are invalid
|
||||
* @param invalidXmlAttributePrefix the prefix to use
|
||||
*/
|
||||
public void setInvalidXmlAttributeNamePrefix(
|
||||
String invalidXmlAttributePrefix) {
|
||||
this.invalidAttributeNamePrefix = invalidXmlAttributePrefix;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set whether to allow invalid attribute names, or to try to fix or omit them
|
||||
* @param allowInvalidAttributeNames True if invalid attributes allowed
|
||||
*/
|
||||
public void setAllowInvalidAttributeNames(
|
||||
boolean allowInvalidAttributeNames) {
|
||||
this.allowInvalidAttributeNames = allowInvalidAttributeNames;
|
||||
}
|
||||
|
||||
/**
|
||||
* If false, when outputting XML, if an attribute name is not valid, attempt to
|
||||
* fix it by using a prefix and removing invalid characters. Otherwise, omit invalid attributes
|
||||
* @return True if invalid attribute names are allowed.
|
||||
*/
|
||||
public boolean isAllowInvalidAttributeNames() {
|
||||
return allowInvalidAttributeNames;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,149 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Contains transformation collection.
|
||||
*/
|
||||
public class CleanerTransformations {
|
||||
|
||||
private Map mappings = new HashMap();
|
||||
private TagTransformation globalTransformations=new TagTransformation();
|
||||
|
||||
public CleanerTransformations() {
|
||||
|
||||
}
|
||||
/**
|
||||
* @param transInfos
|
||||
*/
|
||||
public CleanerTransformations(Map transInfos) {
|
||||
updateTagTransformations(transInfos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds specified tag transformation to the collection.
|
||||
* @param tagTransformation
|
||||
*/
|
||||
public void addTransformation(TagTransformation tagTransformation) {
|
||||
if (tagTransformation != null) {
|
||||
mappings.put( tagTransformation.getSourceTag(), tagTransformation );
|
||||
}
|
||||
}
|
||||
|
||||
public void addGlobalTransformation(AttributeTransformation attributeTransformation) {
|
||||
globalTransformations.addAttributePatternTransformation(attributeTransformation);
|
||||
}
|
||||
|
||||
public boolean hasTransformationForTag(String tagName) {
|
||||
return tagName != null && mappings.containsKey(tagName.toLowerCase());
|
||||
}
|
||||
|
||||
public TagTransformation getTransformation(String tagName) {
|
||||
return tagName != null ? (TagTransformation) mappings.get(tagName.toLowerCase()) : null;
|
||||
}
|
||||
|
||||
public void updateTagTransformations(String key, String value) {
|
||||
int index = key.indexOf('.');
|
||||
|
||||
// new tag transformation case (tagname[=destname[,preserveatts]])
|
||||
if (index <= 0) {
|
||||
String destTag = null;
|
||||
boolean preserveSourceAtts = true;
|
||||
if (value != null) {
|
||||
String[] tokens = Utils.tokenize(value, ",;");
|
||||
if (tokens.length > 0) {
|
||||
destTag = tokens[0];
|
||||
}
|
||||
if (tokens.length > 1) {
|
||||
preserveSourceAtts = "true".equalsIgnoreCase(tokens[1]) ||
|
||||
"yes".equalsIgnoreCase(tokens[1]) ||
|
||||
"1".equals(tokens[1]);
|
||||
}
|
||||
}
|
||||
TagTransformation newTagTrans = new TagTransformation(key, destTag, preserveSourceAtts);
|
||||
addTransformation(newTagTrans);
|
||||
} else { // attribute transformation description
|
||||
String[] parts = Utils.tokenize(key, ".");
|
||||
String tagName = parts[0];
|
||||
TagTransformation trans = getTransformation(tagName);
|
||||
if (trans != null) {
|
||||
trans.addAttributeTransformation(parts[1], value);
|
||||
}
|
||||
}
|
||||
}
|
||||
public void updateTagTransformations(Map transInfos) {
|
||||
Iterator iterator = transInfos.entrySet().iterator();
|
||||
while (iterator.hasNext()) {
|
||||
Map.Entry entry = (Map.Entry) iterator.next();
|
||||
String tag = (String) entry.getKey();
|
||||
String value = (String) entry.getValue();
|
||||
updateTagTransformations(tag, value);
|
||||
}
|
||||
}
|
||||
public Map<String, String> transformAttributes(String originalTagName, Map<String, String> attributes) {
|
||||
TagTransformation tagTrans = getTransformation(originalTagName);
|
||||
Map<String, String> results;
|
||||
if ( tagTrans != null ) {
|
||||
results = tagTrans.applyTagTransformations(attributes);
|
||||
} else {
|
||||
results = attributes;
|
||||
}
|
||||
return this.globalTransformations.applyTagTransformations(results);
|
||||
}
|
||||
|
||||
public String getTagName(String tagName) {
|
||||
TagTransformation tagTransformation = null;
|
||||
if (hasTransformationForTag(tagName)) {
|
||||
tagTransformation = getTransformation(tagName);
|
||||
if (tagTransformation != null) {
|
||||
return tagTransformation.getDestTag();
|
||||
}
|
||||
}
|
||||
return tagName;
|
||||
}
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public void clear() {
|
||||
this.mappings.clear();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/*
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
|
||||
Additional work by Amplafi. -- All rights released.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
 * The ways an element may be closed, expressed as two independent flags: whether the
 * minimized form {@code <x/>} is permitted and whether an end tag is permitted.
 *
 * @author patmoore
 */
public enum CloseTag {

    /** {@code <div></div>} is required; minimizing to {@code <div/>} is not permitted. */
    required(false, true),

    /** Either {@code <hr>} or {@code <hr/>} is permitted. */
    optional(true, true),

    /**
     * The end-tag flag is false while the minimized-form flag is true.
     * NOTE(review): the original comment read "{@code <img/>} is not permitted", which
     * does not match the minimized-form flag -- confirm the intended meaning.
     */
    forbidden(true, false);

    private final boolean minimizedTagPermitted;

    private final boolean endTagPermitted;

    /**
     * @param minimizedAllowed if true the tag can be reduced to {@code <x/>}
     * @param endTagAllowed    value reported by {@link #isEndTagPermitted()}
     */
    CloseTag(boolean minimizedAllowed, boolean endTagAllowed) {
        minimizedTagPermitted = minimizedAllowed;
        endTagPermitted = endTagAllowed;
    }

    /**
     * @return true if the {@code <x/>} form is allowed
     */
    public boolean isMinimizedTagPermitted() {
        return minimizedTagPermitted;
    }

    /**
     * @return the end-tag flag this constant was declared with (original note:
     *         true if {@code <x/>} or {@code </x>} is permitted)
     */
    public boolean isEndTagPermitted() {
        return this.endTagPermitted;
    }
}
|
||||
@@ -0,0 +1,384 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
import java.util.Scanner;
|
||||
import java.util.TreeMap;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import org.htmlcleaner.audit.HtmlModificationListenerLogger;
|
||||
|
||||
/**
|
||||
* <p>Command line usage class.</p>
|
||||
*/
|
||||
public class CommandLine {
|
||||
|
||||
private static final String OMITXMLDECL = "omitxmldecl";
|
||||
|
||||
/**
|
||||
* If the specified argument name exists without a value, return true.
|
||||
* If it exists with a value, translate it as a boolean.
|
||||
* @param args the command line arguments
|
||||
* @param name the switch name
|
||||
* @return true, or false, depending on whether the switch has been specified
|
||||
*/
|
||||
private static boolean getSwitchArgument(String[] args, String name){
|
||||
boolean value = false;
|
||||
for (String curr : args){
|
||||
int eqIndex = curr.indexOf('=');
|
||||
if (eqIndex >= 0) {
|
||||
String argName = curr.substring(0, eqIndex).trim();
|
||||
String argValue = curr.substring(eqIndex+1).trim();
|
||||
if (argName.toLowerCase().startsWith(name.toLowerCase())) {
|
||||
value = toBoolean(argValue);
|
||||
}
|
||||
} else {
|
||||
value = true;
|
||||
}
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
private static String getArgValue(String[] args, String name, String defaultValue) {
|
||||
for (String curr : args) {
|
||||
int eqIndex = curr.indexOf('=');
|
||||
if (eqIndex >= 0) {
|
||||
String argName = curr.substring(0, eqIndex).trim();
|
||||
String argValue = curr.substring(eqIndex+1).trim();
|
||||
|
||||
if (argName.toLowerCase().startsWith(name.toLowerCase())) {
|
||||
return argValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
private static boolean toBoolean(String s) {
|
||||
return s != null && ( "on".equalsIgnoreCase(s) || "true".equalsIgnoreCase(s) || "yes".equalsIgnoreCase(s) );
|
||||
}
|
||||
|
||||
private final static String className = CommandLine.class.getName();
|
||||
private final static Logger logger = Logger.getLogger(className);
|
||||
|
||||
public static void main(String[] args) throws IOException, XPatherException {
|
||||
String source = getArgValue(args, "src", "");
|
||||
Scanner scan = new Scanner(System.in);
|
||||
String s = "";
|
||||
|
||||
if ( "".equals(source) ) {
|
||||
while (scan.hasNext()) {
|
||||
s += scan.nextLine();
|
||||
}
|
||||
if (s.compareTo("") != 0) {
|
||||
System.err.println("Output:");
|
||||
} else {
|
||||
System.err.println("Usage: java -jar htmlcleanerXX.jar src=<url | file> [htmlver=4] [incharset=<charset>] " +
|
||||
"[dest=<file>] [outcharset=<charset>] [taginfofile=<file>] [options...]");
|
||||
System.err.println("Alternative: java -jar htmlcleanerXX.jar (reads the input from console)");
|
||||
System.err.println("");
|
||||
System.err.println("where options include:");
|
||||
System.err.println(" outputtype=simple* | compact | browser-compact | pretty");
|
||||
System.err.println(" advancedxmlescape=true* | false");
|
||||
System.err.println(" usecdata=true* | false");
|
||||
System.err.println(" usecdatafor=<string value> [script,style]");
|
||||
System.err.println(" specialentities=true* | false");
|
||||
System.err.println(" unicodechars=true* | false");
|
||||
System.err.println(" omitunknowntags=true | false*");
|
||||
System.err.println(" treatunknowntagsascontent=true | false*");
|
||||
System.err.println(" omitdeprtags=true | false*");
|
||||
System.err.println(" treatdeprtagsascontent=true | false*");
|
||||
System.err.println(" omitcomments=true | false*");
|
||||
System.err.println(" " +OMITXMLDECL +"=true* | false");
|
||||
System.err.println(" omitdoctypedecl=true* | false");
|
||||
System.err.println(" omithtmlenvelope=true | false*");
|
||||
System.err.println(" useemptyelementtags=true* | false");
|
||||
System.err.println(" allowmultiwordattributes=true* | false");
|
||||
System.err.println(" allowhtmlinsideattributes=true | false*");
|
||||
System.err.println(" ignoreqe=true | false*");
|
||||
System.err.println(" namespacesaware=true* | false");
|
||||
System.err.println(" hyphenreplacement=<string value> [=]");
|
||||
System.err.println(" prunetags=<string value> []");
|
||||
System.err.println(" booleanatts=self* | empty | true");
|
||||
System.err.println(" nodebyxpath=<xpath expression>");
|
||||
System.err.println(" allowinvalidxmlattributenames=true | false*");
|
||||
System.err.println(" invalidxmlattributenameprefix=<string value> []");
|
||||
System.err.println(" t:<sourcetagX>[=<desttag>[,<preserveatts>]]");
|
||||
System.err.println(" t:<sourcetagX>.<destattrY>[=<template>]");
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
String inCharset = getArgValue(args, "incharset", "");
|
||||
if ("".equals(inCharset)) {
|
||||
inCharset = CleanerProperties.DEFAULT_CHARSET;
|
||||
}
|
||||
|
||||
String outCharset = getArgValue(args, "outcharset", "");
|
||||
if ("".equals(outCharset)) {
|
||||
outCharset = CleanerProperties.DEFAULT_CHARSET;
|
||||
}
|
||||
|
||||
String htmlversion = getArgValue(args, "htmlver", "");
|
||||
String destination = getArgValue(args, "dest", "");
|
||||
String outputType = getArgValue(args, "outputtype", "");
|
||||
String advancedXmlEscape = getArgValue(args, "advancedxmlescape", "");
|
||||
String useCData = getArgValue(args, "usecdata", "");
|
||||
String useCDataFor = getArgValue(args, "usecdatafor", "");
|
||||
String translateSpecialEntities = getArgValue(args, "specialentities", "");
|
||||
String unicodeChars = getArgValue(args, "unicodechars", "");
|
||||
String omitUnknownTags = getArgValue(args, "omitunknowntags", "");
|
||||
String treatUnknownTagsAsContent = getArgValue(args, "treatunknowntagsascontent", "");
|
||||
String omitDeprecatedTags = getArgValue(args, "omitdeprtags", "");
|
||||
String treatDeprecatedTagsAsContent = getArgValue(args, "treatdeprtagsascontent", "");
|
||||
String omitComments = getArgValue(args, "omitcomments", "");
|
||||
String omitXmlDeclaration = getArgValue(args, OMITXMLDECL, "");
|
||||
String omitDoctypeDeclaration = getArgValue(args, "omitdoctypedecl", "");
|
||||
String omitHtmlEnvelope = getArgValue(args, "omithtmlenvelope", "");
|
||||
String useEmptyElementTags = getArgValue(args, "useemptyelementtags", "");
|
||||
String allowMultiWordAttributes = getArgValue(args, "allowmultiwordattributes", "");
|
||||
String allowHtmlInsideAttributes = getArgValue(args, "allowhtmlinsideattributes", "");
|
||||
String ignoreQuestAndExclam = getArgValue(args, "ignoreqe", "");
|
||||
String namespacesAware= getArgValue(args, "namespacesaware", "");
|
||||
String commentHyphen = getArgValue(args, "hyphenreplacement", "");
|
||||
String pruneTags = getArgValue(args, "prunetags", "");
|
||||
String booleanAtts = getArgValue(args, "booleanatts", "");
|
||||
String nodeByXPath = getArgValue(args, "nodebyxpath", "");
|
||||
|
||||
String allowInvalidAttributeNames = getArgValue(args, "allowinvalidattributenames", "");
|
||||
String invalidXmlAttributeNamePrefix = getArgValue(args, "invalidxmlattributenameprefix", "");
|
||||
|
||||
HtmlCleaner cleaner;
|
||||
|
||||
String tagInfoFile = getArgValue(args, "taginfofile", "");
|
||||
if ( !"".equals(tagInfoFile) ) {
|
||||
cleaner = new HtmlCleaner(new ConfigFileTagProvider(new File(tagInfoFile)));
|
||||
} else { //Set appropriate TagProvider
|
||||
if (htmlversion.compareTo("4")==0)
|
||||
cleaner = new HtmlCleaner(Html4TagProvider.INSTANCE);
|
||||
else
|
||||
cleaner = new HtmlCleaner(Html5TagProvider.INSTANCE);
|
||||
}
|
||||
|
||||
final CleanerProperties props = cleaner.getProperties();
|
||||
|
||||
//
|
||||
// If the user specifies "quiet" or "quiet=true" then we don't add a modification
|
||||
// listener
|
||||
//
|
||||
if (!getSwitchArgument(args, "quiet"))
|
||||
props.addHtmlModificationListener(new HtmlModificationListenerLogger(logger));
|
||||
|
||||
if ( !"".equals(omitUnknownTags) ) {
|
||||
props.setOmitUnknownTags( toBoolean(omitUnknownTags) );
|
||||
}
|
||||
|
||||
if ( !"".equals(treatUnknownTagsAsContent) ) {
|
||||
props.setTreatUnknownTagsAsContent( toBoolean(treatUnknownTagsAsContent) );
|
||||
}
|
||||
|
||||
if ( !"".equals(omitDeprecatedTags) ) {
|
||||
props.setOmitDeprecatedTags( toBoolean(omitDeprecatedTags) );
|
||||
}
|
||||
|
||||
if ( !"".equals(treatDeprecatedTagsAsContent) ) {
|
||||
props.setTreatDeprecatedTagsAsContent( toBoolean(treatDeprecatedTagsAsContent) );
|
||||
}
|
||||
|
||||
if ( !"".equals(advancedXmlEscape) ) {
|
||||
props.setAdvancedXmlEscape( toBoolean(advancedXmlEscape) );
|
||||
}
|
||||
|
||||
if ( !"".equals(useCData) && "".equals(useCDataFor) ) {
|
||||
props.setUseCdataForScriptAndStyle( toBoolean(useCData) );
|
||||
}
|
||||
|
||||
if ( !"".equals(useCDataFor) ) {
|
||||
props.setUseCdataFor( useCDataFor );
|
||||
}
|
||||
|
||||
if ( !"".equals(translateSpecialEntities) ) {
|
||||
props.setTranslateSpecialEntities( toBoolean(translateSpecialEntities) );
|
||||
}
|
||||
|
||||
if ( !"".equals(unicodeChars) ) {
|
||||
props.setRecognizeUnicodeChars( toBoolean(unicodeChars) );
|
||||
}
|
||||
|
||||
if ( !"".equals(omitComments) ) {
|
||||
props.setOmitComments( toBoolean(omitComments) );
|
||||
}
|
||||
|
||||
if ( !"".equals(omitXmlDeclaration) ) {
|
||||
props.setOmitXmlDeclaration( toBoolean(omitXmlDeclaration) );
|
||||
}
|
||||
|
||||
if ( !"".equals(omitDoctypeDeclaration) ) {
|
||||
props.setOmitDoctypeDeclaration( toBoolean(omitDoctypeDeclaration) );
|
||||
}
|
||||
|
||||
if ( !"".equals(omitHtmlEnvelope) ) {
|
||||
props.setOmitHtmlEnvelope( toBoolean(omitHtmlEnvelope) );
|
||||
}
|
||||
|
||||
if ( !"".equals(useEmptyElementTags) ) {
|
||||
props.setUseEmptyElementTags( toBoolean(useEmptyElementTags) );
|
||||
}
|
||||
|
||||
if ( !"".equals(allowMultiWordAttributes) ) {
|
||||
props.setAllowMultiWordAttributes( toBoolean(allowMultiWordAttributes) );
|
||||
}
|
||||
|
||||
if ( !"".equals(allowHtmlInsideAttributes) ) {
|
||||
props.setAllowHtmlInsideAttributes( toBoolean(allowHtmlInsideAttributes) );
|
||||
}
|
||||
|
||||
if ( !"".equals(ignoreQuestAndExclam) ) {
|
||||
props.setIgnoreQuestAndExclam( toBoolean(ignoreQuestAndExclam) );
|
||||
}
|
||||
|
||||
if ( !"".equals(namespacesAware) ) {
|
||||
props.setNamespacesAware( toBoolean(namespacesAware) );
|
||||
}
|
||||
|
||||
if ( !"".equals(commentHyphen) ) {
|
||||
props.setHyphenReplacementInComment(commentHyphen);
|
||||
}
|
||||
|
||||
if ( !"".equals(pruneTags) ) {
|
||||
props.setPruneTags(pruneTags);
|
||||
}
|
||||
|
||||
if ( !"".equals(booleanAtts) ) {
|
||||
props.setBooleanAttributeValues(booleanAtts);
|
||||
}
|
||||
|
||||
if ( !"".equals(allowInvalidAttributeNames) ) {
|
||||
props.setAllowInvalidAttributeNames( toBoolean(allowInvalidAttributeNames) );
|
||||
}
|
||||
|
||||
if ( !"".equals(invalidXmlAttributeNamePrefix) ) {
|
||||
props.setInvalidXmlAttributeNamePrefix( invalidXmlAttributeNamePrefix );
|
||||
}
|
||||
|
||||
|
||||
// collect transformation info
|
||||
Map transInfos = new TreeMap();
|
||||
for (String arg2 : args) {
|
||||
String arg = arg2;
|
||||
if (arg.startsWith("t:") && arg.length() > 2) {
|
||||
arg = arg.substring(2);
|
||||
int index = arg.indexOf('=');
|
||||
String key = index <= 0 ? arg : arg.substring(0, index);
|
||||
String value = index <= 0 ? null : arg.substring(index + 1);
|
||||
transInfos.put(key, value);
|
||||
}
|
||||
}
|
||||
cleaner.initCleanerTransformations(transInfos);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
TagNode node;
|
||||
|
||||
String src = source.toLowerCase();
|
||||
|
||||
if (src.startsWith("http://") || src.startsWith("https://")) {
|
||||
node = cleaner.clean(new URL(src), inCharset);
|
||||
} else if (!source.isEmpty()) {
|
||||
node = cleaner.clean(new File(source), inCharset);
|
||||
} else {
|
||||
node = cleaner.clean(s);
|
||||
}
|
||||
|
||||
// if user specifies XPath expresssion to choose node for serialization, then
|
||||
// try to evaluate XPath and look for first TagNode instance in the resulting array
|
||||
if ( !"".equals(nodeByXPath) ) {
|
||||
final Object[] xpathResult = node.evaluateXPath(nodeByXPath);
|
||||
int i;
|
||||
for (i = 0; i < xpathResult.length; i++) {
|
||||
if ( xpathResult[i] instanceof TagNode ) {
|
||||
node = (TagNode) xpathResult[i];
|
||||
System.out.println("Node successfully found by XPath.");
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i == xpathResult.length) {
|
||||
System.out.println("Node not found by XPath expression - whole html tree is going to be serialized!");
|
||||
}
|
||||
}
|
||||
|
||||
OutputStream out;
|
||||
if ( destination == null || "".equals(destination.trim()) ) {
|
||||
out = System.out;
|
||||
} else {
|
||||
out = new FileOutputStream(destination);
|
||||
}
|
||||
|
||||
|
||||
|
||||
if ( "compact".equals(outputType) ) {
|
||||
new CompactXmlSerializer(props).writeToStream(node, out, outCharset);
|
||||
} else if ( "browser-compact".equals(outputType) ) {
|
||||
new BrowserCompactXmlSerializer(props).writeToStream(node, out, outCharset);
|
||||
} else if ( "pretty".equals(outputType) ) {
|
||||
new PrettyXmlSerializer(props).writeToStream(node, out, outCharset);
|
||||
} else if ( "htmlsimple".equals(outputType) ) {
|
||||
new SimpleHtmlSerializer(props).writeToStream(node, out, outCharset);
|
||||
} else if ( "htmlpretty".equals(outputType) ) {
|
||||
new PrettyHtmlSerializer(props).writeToStream(node, out, outCharset);
|
||||
} else if ( "htmlcompact".equals(outputType) ) {
|
||||
new CompactHtmlSerializer(props).writeToStream(node, out, outCharset);
|
||||
} else {
|
||||
new SimpleXmlSerializer(props).writeToStream(node, out, outCharset);
|
||||
}
|
||||
|
||||
if (!getSwitchArgument(args, "quiet")){
|
||||
System.out.println("Finished successfully in " + (System.currentTimeMillis() - start)+ "ms." );
|
||||
}
|
||||
|
||||
scan.close();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
|
||||
/**
|
||||
* <p>HTML comment token.</p>
|
||||
*/
|
||||
public class CommentNode extends BaseHtmlNode implements HtmlNode {
|
||||
|
||||
private String content;
|
||||
|
||||
public CommentNode(String content) {
|
||||
this.content = content;
|
||||
}
|
||||
|
||||
public String getCommentedContent() {
|
||||
return "<!--" + content + "-->";
|
||||
}
|
||||
|
||||
public String getContent() {
|
||||
return content;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getCommentedContent();
|
||||
}
|
||||
|
||||
public void serialize(Serializer serializer, Writer writer) throws IOException {
|
||||
writer.write( getCommentedContent() );
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
/* Copyright (c) 2006-2013, HtmlCleaner project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>Compact HTML serializer - creates resulting HTML by stripping whitespaces wherever possible.</p>
|
||||
*/
|
||||
public class CompactHtmlSerializer extends HtmlSerializer {
|
||||
|
||||
private int openPreTags = 0;
|
||||
|
||||
public CompactHtmlSerializer(CleanerProperties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
|
||||
boolean isPreTag = "pre".equalsIgnoreCase(tagNode.getName());
|
||||
if (isPreTag) {
|
||||
openPreTags++;
|
||||
}
|
||||
|
||||
serializeOpenTag(tagNode, writer, false);
|
||||
|
||||
List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
|
||||
if ( !isMinimizedTagSyntax(tagNode) ) {
|
||||
ListIterator<? extends BaseToken> childrenIt = tagChildren.listIterator();
|
||||
while ( childrenIt.hasNext() ) {
|
||||
Object item = childrenIt.next();
|
||||
if (item instanceof ContentNode) {
|
||||
String content = item.toString();
|
||||
if (openPreTags > 0) {
|
||||
writer.write(content);
|
||||
} else {
|
||||
boolean startsWithSpace = content.length() > 0 && Character.isWhitespace( content.charAt(0) );
|
||||
boolean endsWithSpace = content.length() > 1 && Character.isWhitespace( content.charAt(content.length() - 1) );
|
||||
content = dontEscape(tagNode) ? content.trim() : escapeText(content.trim());
|
||||
|
||||
if (startsWithSpace) {
|
||||
writer.write(' ');
|
||||
}
|
||||
|
||||
if (content.length() != 0) {
|
||||
writer.write(content);
|
||||
if (endsWithSpace) {
|
||||
writer.write(' ');
|
||||
}
|
||||
}
|
||||
|
||||
//Removed due to issue #199
|
||||
//if (childrenIt.hasNext()) {
|
||||
// if ( !Utils.isWhitespaceString(childrenIt.next()) ) {
|
||||
// writer.write("\n");
|
||||
// }
|
||||
// childrenIt.previous();
|
||||
//}
|
||||
|
||||
}
|
||||
} else if (item instanceof CommentNode) {
|
||||
String content = ((CommentNode) item).getCommentedContent().trim();
|
||||
writer.write(content);
|
||||
} else if (item instanceof BaseToken) {
|
||||
((BaseToken)item).serialize(this, writer);
|
||||
}
|
||||
}
|
||||
|
||||
serializeEndTag(tagNode, writer, false);
|
||||
if (isPreTag) {
|
||||
openPreTags--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>Compact XML serializer - creates resulting XML by stripping whitespaces.</p>
|
||||
*/
|
||||
public class CompactXmlSerializer extends XmlSerializer {
|
||||
|
||||
public CompactXmlSerializer(CleanerProperties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
|
||||
serializeOpenTag(tagNode, writer, false);
|
||||
|
||||
List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
|
||||
if ( !isMinimizedTagSyntax(tagNode) ) {
|
||||
ListIterator<? extends BaseToken> childrenIt = tagChildren.listIterator();
|
||||
while ( childrenIt.hasNext() ) {
|
||||
Object item = childrenIt.next();
|
||||
if (item != null) {
|
||||
if ( item instanceof ContentNode ) {
|
||||
String content = ((ContentNode) item).getContent().trim();
|
||||
writer.write( dontEscape(tagNode) ? content.replaceAll("]]>", "]]>") : escapeXml(content) );
|
||||
|
||||
if (childrenIt.hasNext()) {
|
||||
if ( !isWhitespaceString(childrenIt.next()) ) {
|
||||
writer.write("\n");
|
||||
}
|
||||
childrenIt.previous();
|
||||
}
|
||||
} else if (item instanceof CommentNode) {
|
||||
String content = ((CommentNode) item).getCommentedContent().trim();
|
||||
writer.write(content);
|
||||
} else {
|
||||
((BaseToken)item).serialize(this, writer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
serializeEndTag(tagNode, writer, false);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether specified object's string representation is empty string (containing of only whitespaces).
|
||||
* @param object Object whose string representation is checked
|
||||
* @return true, if empty string, false otherwise
|
||||
*/
|
||||
private boolean isWhitespaceString(Object object) {
|
||||
if (object != null) {
|
||||
String s = object.toString();
|
||||
return s != null && "".equals(s.trim());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,257 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
import java.io.*;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.net.URL;
|
||||
|
||||
/**
|
||||
* Configuration file tag provider - reads XML file in specified format and creates a Tag Provider.
|
||||
* Used to create custom tag providers when used on the command line.
|
||||
*/
|
||||
public class ConfigFileTagProvider extends HashMap implements ITagInfoProvider {
|
||||
|
||||
// obtaining instance of the SAX parser factory
|
||||
static SAXParserFactory parserFactory = SAXParserFactory.newInstance();
|
||||
static {
|
||||
parserFactory.setValidating(false);
|
||||
parserFactory.setNamespaceAware(false);
|
||||
}
|
||||
|
||||
// tells whether to generate code of the tag provider class based on XML configuration file
|
||||
// to the standard output
|
||||
private boolean generateCode = false;
|
||||
|
||||
private ConfigFileTagProvider() {
|
||||
}
|
||||
|
||||
public ConfigFileTagProvider(InputSource inputSource) {
|
||||
try {
|
||||
new ConfigParser(this).parse(inputSource);
|
||||
} catch (Exception e) {
|
||||
throw new HtmlCleanerException("Error parsing tag configuration file!", e);
|
||||
}
|
||||
}
|
||||
|
||||
public ConfigFileTagProvider(File file) {
|
||||
try {
|
||||
new ConfigParser(this).parse(new InputSource(new FileReader(file)));
|
||||
} catch (Exception e) {
|
||||
throw new HtmlCleanerException("Error parsing tag configuration file!", e);
|
||||
}
|
||||
}
|
||||
|
||||
public ConfigFileTagProvider(URL url) {
|
||||
try {
|
||||
Object content = url.getContent();
|
||||
if (content instanceof InputStream) {
|
||||
InputStreamReader reader = new InputStreamReader((InputStream)content);
|
||||
new ConfigParser(this).parse(new InputSource(reader));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HtmlCleanerException("Error parsing tag configuration file!", e);
|
||||
}
|
||||
}
|
||||
|
||||
public TagInfo getTagInfo(String tagName) {
|
||||
return (TagInfo) get(tagName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates code for tag provider class from specified configuration XML file.
|
||||
* In order to create custom tag info provider, make config file and call this main method
|
||||
* with the specified file. Output will be generated on the standard output. This way a custom
|
||||
* tag provider (class CustomTagProvider) is generated from an XML file. An example XML file,
|
||||
* "example.xml", can be found in the source distribution.
|
||||
*
|
||||
* @param args
|
||||
* @throws IOException
|
||||
* @throws SAXException
|
||||
* @throws ParserConfigurationException
|
||||
*/
|
||||
public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
|
||||
final ConfigFileTagProvider provider = new ConfigFileTagProvider();
|
||||
provider.generateCode = true;
|
||||
|
||||
String fileName = "default.xml";
|
||||
if (args != null && args.length>0){
|
||||
fileName = args[0];
|
||||
}
|
||||
|
||||
File configFile = new File(fileName);
|
||||
String packagePath = "org.htmlcleaner";
|
||||
String className = "CustomTagProvider";
|
||||
|
||||
final ConfigParser parser = provider.new ConfigParser(provider);
|
||||
System.out.println("package " + packagePath + ";");
|
||||
System.out.println("import java.util.HashMap;");
|
||||
System.out.println("public class " + className + " extends HashMap implements ITagInfoProvider {");
|
||||
System.out.println("private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();");
|
||||
System.out.println("// singleton instance, used if no other TagInfoProvider is specified");
|
||||
System.out.println("public final static "+className+" INSTANCE= new "+className+"();");
|
||||
System.out.println("public " + className + "() {");
|
||||
System.out.println("TagInfo tagInfo;");
|
||||
parser.parse( new InputSource(new FileReader(configFile)) );
|
||||
System.out.println("}");
|
||||
System.out.println("}");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* SAX parser for tag configuration files.
|
||||
*/
|
||||
private class ConfigParser extends DefaultHandler {
|
||||
private TagInfo tagInfo = null;
|
||||
private String dependencyName = null;
|
||||
private Map tagInfoMap;
|
||||
|
||||
ConfigParser(Map tagInfoMap) {
|
||||
this.tagInfoMap = tagInfoMap;
|
||||
}
|
||||
|
||||
public void parse(InputSource in) throws ParserConfigurationException, SAXException, IOException {
|
||||
SAXParser parser = parserFactory.newSAXParser();
|
||||
parser.parse(in, this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length) throws SAXException {
|
||||
if (tagInfo != null) {
|
||||
String value = new String(ch, start, length).trim();
|
||||
if ( "fatal-tags".equals(dependencyName) ) {
|
||||
tagInfo.defineFatalTags(value);
|
||||
if (generateCode) {
|
||||
System.out.println("tagInfo.defineFatalTags(\"" + value + "\");");
|
||||
}
|
||||
} else if ( "req-enclosing-tags".equals(dependencyName) ) {
|
||||
tagInfo.defineRequiredEnclosingTags(value);
|
||||
if (generateCode) {
|
||||
System.out.println("tagInfo.defineRequiredEnclosingTags(\"" + value + "\");");
|
||||
}
|
||||
} else if ( "forbidden-tags".equals(dependencyName) ) {
|
||||
tagInfo.defineForbiddenTags(value);
|
||||
if (generateCode) {
|
||||
System.out.println("tagInfo.defineForbiddenTags(\"" + value + "\");");
|
||||
}
|
||||
} else if ( "allowed-children-tags".equals(dependencyName) ) {
|
||||
tagInfo.defineAllowedChildrenTags(value);
|
||||
if (generateCode) {
|
||||
System.out.println("tagInfo.defineAllowedChildrenTags(\"" + value + "\");");
|
||||
}
|
||||
} else if ( "higher-level-tags".equals(dependencyName) ) {
|
||||
tagInfo.defineHigherLevelTags(value);
|
||||
if (generateCode) {
|
||||
System.out.println("tagInfo.defineHigherLevelTags(\"" + value + "\");");
|
||||
}
|
||||
} else if ( "close-before-copy-inside-tags".equals(dependencyName) ) {
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(value);
|
||||
if (generateCode) {
|
||||
System.out.println("tagInfo.defineCloseBeforeCopyInsideTags(\"" + value + "\");");
|
||||
}
|
||||
} else if ( "close-inside-copy-after-tags".equals(dependencyName) ) {
|
||||
tagInfo.defineCloseInsideCopyAfterTags(value);
|
||||
if (generateCode) {
|
||||
System.out.println("tagInfo.defineCloseInsideCopyAfterTags(\"" + value + "\");");
|
||||
}
|
||||
} else if ( "close-before-tags".equals(dependencyName) ) {
|
||||
tagInfo.defineCloseBeforeTags(value);
|
||||
if (generateCode) {
|
||||
System.out.println("tagInfo.defineCloseBeforeTags(\"" + value + "\");");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
|
||||
if ( "tag".equals(qName) ) {
|
||||
String name = attributes.getValue("name");
|
||||
String content = attributes.getValue("content");
|
||||
String section = attributes.getValue("section");
|
||||
String deprecated = attributes.getValue("deprecated");
|
||||
String unique = attributes.getValue("unique");
|
||||
String ignorePermitted = attributes.getValue("ignore-permitted");
|
||||
ContentType contentType = ContentType.toValue(content);
|
||||
BelongsTo belongsTo = BelongsTo.toValue(section);
|
||||
tagInfo = new TagInfo(name, contentType,
|
||||
belongsTo,
|
||||
deprecated != null && "true".equals(deprecated),
|
||||
unique != null && "true".equals(unique),
|
||||
ignorePermitted != null && "true".equals(ignorePermitted), CloseTag.required, Display.any );
|
||||
if (generateCode) {
|
||||
String s = "tagInfo = new TagInfo(\"#1\", #2, #3, #4, #5, #6);";
|
||||
s = s.replaceAll("#1", name);
|
||||
s = s.replaceAll("#2", ContentType.class.getCanonicalName()+"."+contentType.name());
|
||||
s = s.replaceAll("#3", BelongsTo.class.getCanonicalName()+"."+belongsTo.name());
|
||||
s = s.replaceAll("#4", Boolean.toString(deprecated != null && "true".equals(deprecated)));
|
||||
s = s.replaceAll("#5", Boolean.toString(unique != null && "true".equals(unique)));
|
||||
s = s.replaceAll("#6", Boolean.toString(ignorePermitted != null && "true".equals(ignorePermitted)));
|
||||
System.out.println(s);
|
||||
}
|
||||
} else if ( !"tags".equals(qName) ) {
|
||||
dependencyName = qName;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String qName) throws SAXException {
|
||||
if ( "tag".equals(qName) ) {
|
||||
if (tagInfo != null) {
|
||||
tagInfoMap.put(tagInfo.getName(), tagInfo);
|
||||
if (generateCode) {
|
||||
System.out.println("this.put(\"" + tagInfo.getName() + "\", tagInfo);\n");
|
||||
}
|
||||
}
|
||||
tagInfo = null;
|
||||
} else if ( !"tags".equals(qName) ) {
|
||||
dependencyName = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,72 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
|
||||
/**
|
||||
* <p>HTML text token.</p>
|
||||
*/
|
||||
public class ContentNode extends BaseHtmlNode implements HtmlNode {
|
||||
|
||||
protected final String content;
|
||||
protected final boolean blank;
|
||||
|
||||
public ContentNode(String content) {
|
||||
this.content = content;
|
||||
this.blank = Utils.isEmptyString(this.content);
|
||||
}
|
||||
|
||||
public String getContent() {
|
||||
return content;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getContent();
|
||||
}
|
||||
|
||||
public void serialize(Serializer serializer, Writer writer) throws IOException {
|
||||
writer.write( getContent() );
|
||||
}
|
||||
|
||||
public boolean isBlank() {
|
||||
return this.blank;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,76 @@
|
||||
/*
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
 * Describes what kind of content an element may contain.
 *
 * @author patmoore
 */
public enum ContentType {
    all("all"),
    /**
     * Elements that have no children or content (for example {@code <img>}). For these
     * elements, the check for null elements must be more than just a children/content check.
     */
    none("none"),
    text("text");

    /** Textual code of the constant. */
    private final String dbCode;

    ContentType(String dbCode) {
        this.dbCode = dbCode;
    }

    /**
     * @return the dbCode
     */
    public String getDbCode() {
        return dbCode;
    }

    /**
     * Converts an arbitrary value to a content type.
     * A {@code ContentType} is returned unchanged. Any other non-null value is converted
     * with {@code toString()}, trimmed, and compared case-insensitively against the code
     * and the name of every constant.
     *
     * @param value value to convert, may be null
     * @return the matching constant, or null when the value is null or matches nothing
     */
    public static ContentType toValue(Object value) {
        if (value instanceof ContentType) {
            return (ContentType) value;
        }
        if (value == null) {
            return null;
        }

        final String wanted = value.toString().trim();
        for (ContentType candidate : values()) {
            final boolean codeMatches = candidate.getDbCode().equalsIgnoreCase(wanted);
            if (codeMatches || candidate.name().equalsIgnoreCase(wanted)) {
                return candidate;
            }
        }
        return null;
    }
}
|
||||
@@ -0,0 +1,645 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
/**
|
||||
* This is the default tag provider for HTML Cleaner.
|
||||
* Note this is no longer generated from XML - see https://sourceforge.net/p/htmlcleaner/bugs/81/
|
||||
*/
|
||||
public class DefaultTagProvider implements ITagInfoProvider {
|
||||
|
||||
private static final String STRONG = "strong";
|
||||
private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
|
||||
// singleton instance, used if no other TagInfoProvider is specified
|
||||
public final static DefaultTagProvider INSTANCE= new DefaultTagProvider();
|
||||
|
||||
private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo,"+STRONG+",em,q,b,i,u,tt,sub,sup,big,small,strike,s,font";
|
||||
private static final String CLOSE_BEFORE_TAGS = "h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
|
||||
|
||||
/**
|
||||
* Phrasing tags are those that can make up paragraphs along with text to make Phrasing Content
|
||||
*/
|
||||
private static final String PHRASING_TAGS = "a,abbr,area,audio,b,bdi,bdo,br,button,canvas,cite,code,data,datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,keygen,label,link,map,mark,math,meta,meter,noscript,object,output,progress,q,ruby,s,samp,script,select,small,span,strong,sub,sup,svg,template,textarea,time,u,var,video,wbr";
|
||||
|
||||
/**
|
||||
* HTML5 Media Tags
|
||||
*/
|
||||
private static final String MEDIA_TAGS = "audio,video";
|
||||
|
||||
public DefaultTagProvider() {
|
||||
TagInfo tagInfo;
|
||||
|
||||
|
||||
// private static final Set<String> END_TAG_OPTIONAL = Collections.unmodifiableSet(new HashSet(Arrays.asList(
|
||||
// "thead", "dt", "body", "tr", "colgroup", "td", "tfoot", "th", "li", "dd", "tbody", "p", "html", "head", "option")));
|
||||
// private static final Set<String> END_TAG_FORBIDDEN = Collections.unmodifiableSet(new HashSet(Arrays.asList(
|
||||
// "hr", "col", "param", "link", "img", "br", "meta", "input", "frame", "area", "basefont", "base", "isindex")));
|
||||
// private static final Set<String> END_TAG_REQUIRED = Collections.unmodifiableSet(new HashSet(Arrays.asList(
|
||||
// "noscript", "kbd", "center", "button", "h5", "h4", "samp", "ol", "h6", "h1", "h3", "h2", "form", "select",
|
||||
// "font", "menu", "ins",
|
||||
// "abbr", "label", "table", "code", "script", "cite", "iframe", "strong", "textarea", "noframes", "big",
|
||||
// "small", "span", "sub", "optgroup", "bdo", "var", "div", "object", "sup", "title", "strike", "style",
|
||||
// "dir", "map", "applet", "dl", "del", "fieldset", "ul", "b", "acronym", "a", "blockquote",
|
||||
// "caption", "i", "u", "s", "frameset", "tt", "address", "q", "pre", "legend", "em", "dfn")));
|
||||
tagInfo = new TagInfo("div", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("div", tagInfo);
|
||||
|
||||
/**
|
||||
* The HTML5 semantic flow tags
|
||||
*/
|
||||
|
||||
// Sectioning tags
|
||||
tagInfo = new TagInfo("aside", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p");
|
||||
this.put("aside", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("section", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p");
|
||||
this.put("section", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("article", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p");
|
||||
this.put("article", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("main", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p");
|
||||
this.put("main", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("nav", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p");
|
||||
this.put("nav", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("details", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p");
|
||||
this.put("details", tagInfo);
|
||||
tagInfo = new TagInfo("summary", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineRequiredEnclosingTags("details");
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p");
|
||||
this.put("summary", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("figure", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p");
|
||||
this.put("figure", tagInfo);
|
||||
tagInfo = new TagInfo("figcaption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineRequiredEnclosingTags("figure");
|
||||
this.put("figcaption", tagInfo);
|
||||
|
||||
// header and footer
|
||||
tagInfo = new TagInfo("header", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,header,footer,main");
|
||||
this.put("header", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("footer", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,header,footer,main");
|
||||
this.put("footer", tagInfo);
|
||||
|
||||
/**
|
||||
* Html5 phrasing tags
|
||||
*/
|
||||
tagInfo = new TagInfo("mark", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("mark", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("bdi", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("bdi", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("time", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("time", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("meter", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("meter");
|
||||
this.put("meter", tagInfo);
|
||||
|
||||
|
||||
/**
|
||||
* Html5 Ruby text
|
||||
*/
|
||||
tagInfo = new TagInfo("ruby", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags("rt,rp");
|
||||
this.put("ruby", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("rt", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.inline);
|
||||
//
|
||||
// If we include this rule, we get an out-of-memory error. See issue 126.
|
||||
//
|
||||
//tagInfo.defineRequiredEnclosingTags("ruby");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("rt", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("rp", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.inline);
|
||||
//
|
||||
// If we include this rule, we get an out-of-memory error. See issue 126.
|
||||
//
|
||||
//tagInfo.defineRequiredEnclosingTags("ruby");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("rp", tagInfo);
|
||||
|
||||
/**
|
||||
* Html5 media tags
|
||||
*/
|
||||
tagInfo = new TagInfo("audio", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
|
||||
this.put("audio", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("video", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
|
||||
this.put("video", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("source", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.any);
|
||||
tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
|
||||
this.put("source", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("track", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.any);
|
||||
tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
|
||||
this.put("track", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("canvas", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("canvas", tagInfo);
|
||||
|
||||
/**
|
||||
* Html5 interactive tags
|
||||
*/
|
||||
tagInfo = new TagInfo("dialog", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("dialog", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("progress", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("progress");
|
||||
this.put("progress", tagInfo);
|
||||
|
||||
/**
|
||||
* HTML 4 and earlier tags
|
||||
*/
|
||||
|
||||
tagInfo = new TagInfo("span", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("span", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("meta", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("meta", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("link", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("link", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("title", ContentType.text, BelongsTo.HEAD, false, true, false, CloseTag.required, Display.none);
|
||||
this.put("title", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("style", ContentType.text, BelongsTo.HEAD, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("style", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("bgsound", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("bgsound", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h1", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h1", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h2", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h2", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h3", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h3", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h4", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h4", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h5", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h5", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h6", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h6", tagInfo);
|
||||
|
||||
// jericho parser requires <p></p>
|
||||
tagInfo = new TagInfo("p", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("p", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put(STRONG, tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("em", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("em", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("abbr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("acronym", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("acronym", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("address", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("address", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("bdo", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("blockquote", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("blockquote", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("cite", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("cite", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("q", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("q", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("code", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("code", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("ins", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("ins", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("del", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("del", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("dfn", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("kbd", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("pre", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("pre", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("samp", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("samp", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("listing", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("listing", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("var", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("var", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("br", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("br", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("wbr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("nobr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("nobr");
|
||||
this.put("nobr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("xmp", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("xmp", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("a", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("a");
|
||||
this.put("a", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("base", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("base", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("img", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
|
||||
this.put("img", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("area", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
|
||||
tagInfo.defineFatalTags("map");
|
||||
tagInfo.defineCloseBeforeTags("area");
|
||||
this.put("area", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("map", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseBeforeTags("map");
|
||||
this.put("map", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("object", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("object", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("param", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("param", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("applet", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.any);
|
||||
this.put("applet", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("xml", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("xml", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("ul", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("ul", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("ol", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("ol", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("li", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("li,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("li", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dl", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("dl", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeTags("dt,dd");
|
||||
this.put("dt", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeTags("dt,dd");
|
||||
this.put("dd", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("menu", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("menu", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dir", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("dir", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("table", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,colgroup,caption");
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("table", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tbody");
|
||||
tagInfo.defineAllowedChildrenTags("td,th");
|
||||
tagInfo.defineHigherLevelTags("thead,tfoot");
|
||||
tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup");
|
||||
this.put("tr", tagInfo);
|
||||
|
||||
// jericho parser requires <td></td>
|
||||
tagInfo = new TagInfo("td", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tr");
|
||||
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
|
||||
this.put("td", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("th", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tr");
|
||||
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
|
||||
this.put("th", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tbody", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("tbody", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("thead", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("thead", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tfoot", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("tfoot", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("col", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineFatalTags("colgroup");
|
||||
this.put("col", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("col");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("colgroup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("caption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("caption", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("form", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.block);
|
||||
tagInfo.defineForbiddenTags("form");
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("form", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("input", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("input", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("textarea", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("textarea", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("select", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags("option,optgroup");
|
||||
tagInfo.defineCloseBeforeTags("option,optgroup,select");
|
||||
this.put("select", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("option", ContentType.text, BelongsTo.BODY, false, false, true, CloseTag.optional, Display.inline);
|
||||
tagInfo.defineFatalTags("select");
|
||||
tagInfo.defineCloseBeforeTags("option");
|
||||
this.put("option", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("optgroup", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
|
||||
tagInfo.defineFatalTags("select");
|
||||
tagInfo.defineAllowedChildrenTags("option");
|
||||
tagInfo.defineCloseBeforeTags("optgroup");
|
||||
this.put("optgroup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("button", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("button", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("label", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("label", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("legend", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
//
|
||||
// If we include this rule, we get an out-of-memory error. See issue 129.
|
||||
//
|
||||
//tagInfo.defineRequiredEnclosingTags("fieldset");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("legend", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("fieldset", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("fieldset", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("isindex", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("isindex", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("script", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("script", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("noscript", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.block);
|
||||
this.put("noscript", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("b", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("u,i,tt,sub,sup,big,small,strike,blink,s");
|
||||
this.put("b", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("i", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,tt,sub,sup,big,small,strike,blink,s");
|
||||
this.put("i", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("u", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,i,tt,sub,sup,big,small,strike,blink,s");
|
||||
this.put("u", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,big,small,strike,blink,s");
|
||||
this.put("tt", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sup,big,small,strike,blink,s");
|
||||
this.put("sub", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,big,small,strike,blink,s");
|
||||
this.put("sup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("big", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,small,strike,blink,s");
|
||||
this.put("big", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("small", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,strike,blink,s");
|
||||
this.put("small", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("strike", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,blink,s");
|
||||
this.put("strike", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("blink", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,s");
|
||||
this.put("blink", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("marquee", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("marquee", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("s", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,blink");
|
||||
this.put("s", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("hr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("hr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("font", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
|
||||
this.put("font", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("basefont", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("basefont", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("center", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("center", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("comment", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("comment", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("server", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("server", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("iframe", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("embed", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("embed", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param key
|
||||
* @param tagInfo
|
||||
*/
|
||||
protected void put(String tagName, TagInfo tagInfo) {
|
||||
this.tagInfoMap.put(tagName, tagInfo);
|
||||
}
|
||||
|
||||
public TagInfo getTagInfo(String tagName) {
|
||||
if ( tagName == null) {
|
||||
// null named tagNode happens when a html fragment is being dealt with
|
||||
return null;
|
||||
} else {
|
||||
return this.tagInfoMap.get(tagName);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
 * Most HTML 4 elements permitted within the BODY are classified as either
 * block-level elements or inline elements. This enumeration contains
 * corresponding constants to distinguish them.
 *
 * <p>Each constant carries two immutable formatting hints that can be read
 * through {@link #isAfterTagLineBreakNeeded()} and
 * {@link #isLeadingAndEndWhitespacesAllowed()}.</p>
 *
 * @author Konstantin Burov (aectann@gmail.com)
 */
public enum Display {

    /**
     * Block-level elements typically contain inline elements and other
     * block-level elements. When rendered visually, block-level elements
     * usually begin on a new line.
     */
    block(true, false),

    /**
     * Inline elements typically may only contain text and other inline
     * elements. When rendered visually, inline elements do not usually begin
     * on a new line.
     */
    inline(false, true),

    /**
     * Elements that may be used as either block-level elements or inline
     * elements. If used as inline elements (e.g., within another inline
     * element or a P), these elements should not contain any block-level
     * elements.
     */
    any(true, false),

    /**
     * Elements that are not actually inline or block; usually such elements
     * are not rendered at all.
     */
    none(true, false);

    // Enum constants are shared singletons, so their state is kept final.
    private final boolean afterTagLineBreakNeeded;
    private final boolean leadingAndEndWhitespacesAllowed;

    /**
     * @param afterTagLineBreakNeeded value reported by {@link #isAfterTagLineBreakNeeded()}
     * @param leadingAndEndWhitespacesAllowed value reported by
     *        {@link #isLeadingAndEndWhitespacesAllowed()}
     */
    Display(boolean afterTagLineBreakNeeded, boolean leadingAndEndWhitespacesAllowed) {
        this.afterTagLineBreakNeeded = afterTagLineBreakNeeded;
        this.leadingAndEndWhitespacesAllowed = leadingAndEndWhitespacesAllowed;
    }

    /**
     * @return true to advise serializers to put a line break after tags with such a display type.
     */
    public boolean isAfterTagLineBreakNeeded() {
        return afterTagLineBreakNeeded;
    }

    /**
     * @return true if tag contents can have single leading or end whitespace
     */
    public boolean isLeadingAndEndWhitespacesAllowed() {
        return leadingAndEndWhitespacesAllowed;
    }

}
|
||||
@@ -0,0 +1,389 @@
|
||||
/* Copyright (c) 2006-2013, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
|
||||
/**
|
||||
* <p>HTML doctype token.</p>
|
||||
*/
|
||||
public class DoctypeToken extends BaseHtmlNode implements HtmlNode{
|
||||
|
||||
//
|
||||
// Part 1 is the document type, typically 'html' or 'HTML'
|
||||
//
|
||||
private String part1;
|
||||
|
||||
//
|
||||
// Part 2 is the PUBLIC or SYSTEM token
|
||||
//
|
||||
private String part2;
|
||||
|
||||
//
|
||||
// Part 3 is the PUBLIC identifier, typically '-//W3C//DTD HTML 4.01//EN' or similar
|
||||
//
|
||||
private String part3;
|
||||
|
||||
//
|
||||
// Part 4 is the SYSTEM identifier, typically a URL for the DTD
|
||||
//
|
||||
private String part4;
|
||||
|
||||
/**
|
||||
* The identified DocType, if any
|
||||
*/
|
||||
private Integer type = null;
|
||||
|
||||
|
||||
//
|
||||
// Constants for identified doctypes
|
||||
//
|
||||
|
||||
public static final int UNKNOWN = 0;
|
||||
public static final int HTML4_0 = 10;
|
||||
public static final int HTML4_01 = 20;
|
||||
public static final int HTML4_01_STRICT = 21;
|
||||
public static final int HTML4_01_TRANSITIONAL = 22;
|
||||
public static final int HTML4_01_FRAMESET = 23;
|
||||
public static final int XHTML1_0_STRICT = 31;
|
||||
public static final int XHTML1_0_TRANSITIONAL = 32;
|
||||
public static final int XHTML1_0_FRAMESET = 33;
|
||||
public static final int XHTML1_1 = 40;
|
||||
public static final int XHTML1_1_BASIC = 41;
|
||||
public static final int HTML5 = 60;
|
||||
public static final int HTML5_LEGACY_TOOL_COMPATIBLE = 61;
|
||||
|
||||
//
|
||||
// Whether the DocType is valid
|
||||
//
|
||||
private Boolean valid = null;
|
||||
|
||||
public DoctypeToken(String part1, String part2, String part3, String part4) {
|
||||
this.part1 = part1;
|
||||
this.part2 = part2 != null ? part2.toUpperCase() : part2;
|
||||
this.part3 = clean(part3);
|
||||
this.part4 = clean(part4);
|
||||
validate();
|
||||
}
|
||||
|
||||
/*
|
||||
* Constructor for 5-part DocTypes, e.g. <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" SYSTEM "http://www.w3.org/TR/html4/strict.dtd">.
|
||||
* For this we ignore part4 as we assume that must be "SYSTEM".
|
||||
*/
|
||||
public DoctypeToken(String part1, String part2, String part3, String part4, String part5) {
|
||||
this.part1 = part1;
|
||||
this.part2 = part2 != null ? part2.toUpperCase() : part2;
|
||||
this.part3 = clean(part3);
|
||||
this.part4 = clean(part5);
|
||||
validate();
|
||||
}
|
||||
|
||||
private String clean(String s) {
|
||||
if (s != null) {
|
||||
s = s.replace('>', ' ');
|
||||
s = s.replace('<', ' ');
|
||||
s = s.replace('&', ' ');
|
||||
s = s.replace('\'', ' ');
|
||||
s = s.replace('\"', ' ');
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
public boolean isValid(){
|
||||
return valid;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks the doctype according to W3C parsing rules and tries to identify
|
||||
* the type and validity
|
||||
*
|
||||
* See:
|
||||
* <ul>
|
||||
* <li>http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax</li>
|
||||
* <li>http://dev.w3.org/html5/html-author/#doctype-declaration</li>
|
||||
* </ul>
|
||||
*/
|
||||
private void validate() {
|
||||
|
||||
//
|
||||
// No PUBLIC or SYSTEM token
|
||||
//
|
||||
if (!"public".equalsIgnoreCase(part2) && !"system".equalsIgnoreCase(part2)) {
|
||||
|
||||
//
|
||||
// HTML 5
|
||||
//
|
||||
if ("html".equalsIgnoreCase(part1) && (part2 == null)){
|
||||
type = HTML5;
|
||||
valid = true;
|
||||
}
|
||||
}
|
||||
|
||||
if ("public".equalsIgnoreCase(part2)){
|
||||
|
||||
//
|
||||
// HTML 4.0 is valid without an ID, or with strict DTD ID
|
||||
//
|
||||
if ("-//W3C//DTD HTML 4.0//EN".equals(getPublicId())){
|
||||
type = HTML4_0;
|
||||
if ("http://www.w3.org/TR/REC-html40/strict.dtd".equals(part4) || "".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// HTML 4.0.1 STRICT is valid with Strict dtd ID or empty
|
||||
//
|
||||
if ("-//W3C//DTD HTML 4.01//EN".equals(getPublicId())){
|
||||
type = HTML4_01_STRICT;
|
||||
if ("http://www.w3.org/TR/html4/strict.dtd".equals(part4) || "".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// HTML 4.0.1 TRANSITIONAL valid only with Transitional DTD ID
|
||||
//
|
||||
if ("-//W3C//DTD HTML 4.01 Transitional//EN".equals(getPublicId())){
|
||||
type = HTML4_01_TRANSITIONAL;
|
||||
if ("http://www.w3.org/TR/html4/loose.dtd".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// HTML 4.0.1 FRAMESET valid only with Frameset ID
|
||||
//
|
||||
if ("-//W3C//DTD HTML 4.01 Frameset//EN".equals(getPublicId())){
|
||||
type = HTML4_01_FRAMESET;
|
||||
|
||||
if ("http://www.w3.org/TR/html4/frameset.dtd".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// XHTML 1.0
|
||||
//
|
||||
if ("-//W3C//DTD XHTML 1.0 Strict//EN".equals(getPublicId())){
|
||||
type = XHTML1_0_STRICT;
|
||||
if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// XHTML 1.0 Transitional
|
||||
//
|
||||
if ("-//W3C//DTD XHTML 1.0 Transitional//EN".equals(getPublicId())){
|
||||
type = XHTML1_0_TRANSITIONAL;
|
||||
|
||||
if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// XHTML 1.0 Frameset
|
||||
//
|
||||
if ("-//W3C//DTD XHTML 1.0 Frameset//EN".equals(getPublicId())){
|
||||
type = XHTML1_0_FRAMESET;
|
||||
|
||||
if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// XHTML 1.1
|
||||
//
|
||||
if ("-//W3C//DTD XHTML 1.1//EN".equals(getPublicId())){
|
||||
type = XHTML1_1;
|
||||
if ("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// XHTML 1.1 Basic
|
||||
//
|
||||
if ("-//W3C//DTD XHTML Basic 1.1//EN".equals(getPublicId())){
|
||||
type = XHTML1_1_BASIC;
|
||||
|
||||
if ("http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd".equals(getSystemId())){
|
||||
valid = true;
|
||||
} else {
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ("system".equalsIgnoreCase(part2)){
|
||||
|
||||
//
|
||||
// HTML 5 legacy tool compatible
|
||||
//
|
||||
if ("about:legacy-compat".equals(getPublicId())){
|
||||
type = HTML5_LEGACY_TOOL_COMPATIBLE;
|
||||
valid = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (type == null){
|
||||
type = UNKNOWN;
|
||||
valid = false;
|
||||
}
|
||||
}
|
||||
|
||||
public String getContent() {
|
||||
|
||||
if (type == UNKNOWN && part1 == null){
|
||||
return "<!DOCTYPE>";
|
||||
}
|
||||
|
||||
String result = "<!DOCTYPE ";
|
||||
|
||||
//
|
||||
// If the type is XHTML or HTML5, the output is "html", otherwise it should be "HTML"
|
||||
//
|
||||
if (type != UNKNOWN){
|
||||
if (type >= 30){
|
||||
result += "html";
|
||||
} else {
|
||||
result += "HTML";
|
||||
}
|
||||
} else {
|
||||
//
|
||||
// if its an unknown doctype, just pass through as-is.
|
||||
//
|
||||
result += part1 ;
|
||||
}
|
||||
|
||||
|
||||
if (part2 != null){
|
||||
result += " " + part2 + " \"" + part3 + "\"";
|
||||
|
||||
if (!"".equals(part4) ) {
|
||||
result += " \"" + part4 + "\"";
|
||||
}
|
||||
}
|
||||
|
||||
result += ">";
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getContent();
|
||||
}
|
||||
|
||||
/**
|
||||
* This will retrieve an integer representing the identified DocType
|
||||
*/
|
||||
public int getType(){
|
||||
return type;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return "";
|
||||
}
|
||||
|
||||
public void serialize(Serializer serializer, Writer writer) throws IOException {
|
||||
writer.write(getContent() + "\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* This will retrieve the public ID of an externally referenced DTD, or an empty String if none is referenced.
|
||||
*/
|
||||
public String getPublicId(){
|
||||
return part3;
|
||||
}
|
||||
|
||||
/**
|
||||
* This will retrieve the system ID of an externally referenced DTD, or an empty String if none is referenced.
|
||||
*/
|
||||
public String getSystemId(){
|
||||
return part4;
|
||||
}
|
||||
|
||||
public String getPart1() {
|
||||
return part1;
|
||||
}
|
||||
|
||||
public String getPart2() {
|
||||
return part2;
|
||||
}
|
||||
|
||||
/**
|
||||
* Deprecated - use getPublicId() instead
|
||||
* @return the third part of the DOCSTRING
|
||||
*/
|
||||
@Deprecated
|
||||
public String getPart3() {
|
||||
return part3;
|
||||
}
|
||||
|
||||
/**
|
||||
* Deprecated - use getSystemId() instead
|
||||
* @return the fourth part of the DOCSTRING
|
||||
*/
|
||||
@Deprecated
|
||||
public String getPart4() {
|
||||
return part4;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,275 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import org.w3c.dom.CDATASection;
|
||||
import org.w3c.dom.Comment;
|
||||
import org.w3c.dom.DOMImplementation;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.DocumentType;
|
||||
import org.w3c.dom.Element;
|
||||
|
||||
public class DomBuilder implements XmlVisitor{
|
||||
|
||||
private Document document;
|
||||
private Element destinationElement;
|
||||
private CleanerProperties props;
|
||||
|
||||
protected boolean escapeXml = true;
|
||||
protected boolean deserializeCdataEntities = false;
|
||||
protected boolean strictErrorChecking = true;
|
||||
|
||||
private static final String CSS_COMMENT_START = "/*";
|
||||
|
||||
/**
 * Creates a builder with the given cleaner properties and flags.
 *
 * @param props cleaner properties consulted while visiting nodes
 * @param escapeXml when true, text content is a candidate for XML escaping
 * @param deserializeCdataEntities when true, content under tags configured to
 *        use CDATA is passed through entity deserialization
 * @param strictErrorChecking stored on the builder; its use is outside this
 *        constructor
 */
public DomBuilder(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking) {
    this.strictErrorChecking = strictErrorChecking;
    this.deserializeCdataEntities = deserializeCdataEntities;
    this.escapeXml = escapeXml;
    this.props = props;
}
|
||||
|
||||
public Document getDocument(){
|
||||
return this.document;
|
||||
}
|
||||
|
||||
private boolean shouldEscapeOrTranslateEntities() {
|
||||
return escapeXml || props.isRecognizeUnicodeChars() || props.isTranslateSpecialEntities();
|
||||
}
|
||||
|
||||
public void head(HtmlNode node, int depth) {
|
||||
|
||||
//
|
||||
// For script and style nodes, check if we're set to use CDATA
|
||||
//
|
||||
CDATASection cdata = null;
|
||||
if (node instanceof TagNode && props.isUseCdataFor(((TagNode)node).getName())){
|
||||
cdata = document.createCDATASection("");
|
||||
destinationElement.appendChild(document.createTextNode(CSS_COMMENT_START));
|
||||
destinationElement.appendChild(cdata);
|
||||
}
|
||||
|
||||
if (node instanceof CommentNode) {
|
||||
|
||||
CommentNode commentNode = (CommentNode) node;
|
||||
Comment comment = document.createComment( commentNode.getContent() );
|
||||
destinationElement.appendChild(comment);
|
||||
|
||||
} else if (node instanceof ContentNode) {
|
||||
|
||||
ContentNode contentNode = (ContentNode) node;
|
||||
String content = contentNode.getContent();
|
||||
boolean specialCase = props.isUseCdataFor(node.getParent().getName());
|
||||
|
||||
if (shouldEscapeOrTranslateEntities() && !specialCase) {
|
||||
content = Utils.escapeXml(content, props, true);
|
||||
}
|
||||
|
||||
if (specialCase && node instanceof CData){
|
||||
//
|
||||
// For CDATA sections we don't want to return the start and
|
||||
// end tokens. See issue #106.
|
||||
//
|
||||
content = ((CData)node).getContentWithoutStartAndEndTokens();
|
||||
}
|
||||
|
||||
if (specialCase && deserializeCdataEntities){
|
||||
content = this.deserializeCdataEntities(content);
|
||||
}
|
||||
|
||||
if (cdata != null){
|
||||
cdata.appendData(content);
|
||||
} else {
|
||||
destinationElement.appendChild(document.createTextNode(content) );
|
||||
}
|
||||
|
||||
|
||||
} else if (node instanceof TagNode) {
|
||||
|
||||
TagNode subTagNode = (TagNode) node;
|
||||
|
||||
//
|
||||
// XML element names are more strict in their definition
|
||||
// than HTML tag identifiers.
|
||||
// See https://www.w3.org/TR/xml/#NT-Name
|
||||
// vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
//
|
||||
String name = Utils.sanitizeXmlIdentifier(subTagNode.getName(), props.getInvalidXmlAttributeNamePrefix());
|
||||
|
||||
//
|
||||
// If the element name is completely invalid, treat it as text
|
||||
//
|
||||
if (name == null){
|
||||
ContentNode contentNode = new ContentNode(subTagNode.getName() + subTagNode.getText().toString());
|
||||
String content = contentNode.getContent();
|
||||
content = Utils.escapeXml(content, props, true);
|
||||
destinationElement.appendChild(document.createTextNode(content) );
|
||||
|
||||
} else {
|
||||
|
||||
if (document == null){
|
||||
try {
|
||||
document = this.createDocument(subTagNode);
|
||||
} catch (ParserConfigurationException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
Element element = document.createElement( name );
|
||||
|
||||
//
|
||||
// Create attributes
|
||||
//
|
||||
Map<String, String> attributes = subTagNode.getAttributes();
|
||||
Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
|
||||
while (entryIterator.hasNext()) {
|
||||
Map.Entry<String, String> entry = entryIterator.next();
|
||||
String attrName = entry.getKey();
|
||||
String attrValue = entry.getValue();
|
||||
if (escapeXml) {
|
||||
attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
|
||||
attrValue = Utils.escapeXml(attrValue, props, true);
|
||||
}
|
||||
|
||||
//
|
||||
// Fix any invalid attribute names by adding a prefix
|
||||
//
|
||||
if (!props.isAllowInvalidAttributeNames()){
|
||||
attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
|
||||
}
|
||||
|
||||
if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())){
|
||||
element.setAttribute(attrName, attrValue);
|
||||
|
||||
//
|
||||
// Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
|
||||
//
|
||||
if (attrName.equalsIgnoreCase("id")) {
|
||||
element.setIdAttribute(attrName, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (destinationElement == null){
|
||||
destinationElement = document.getDocumentElement();
|
||||
} else {
|
||||
destinationElement.appendChild(element);
|
||||
destinationElement = element;
|
||||
}
|
||||
|
||||
//
|
||||
// Hack for now, we need a better way to do this in future
|
||||
//
|
||||
for (Object token: subTagNode.getAllChildren()){
|
||||
if (token instanceof ContentNode){
|
||||
((ContentNode)token).setParent(subTagNode);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
protected String deserializeCdataEntities(String input){
|
||||
return Utils.deserializeEntities(input, props.isRecognizeUnicodeChars());
|
||||
}
|
||||
|
||||
public void tail(HtmlNode node, int depth) {
|
||||
if (node instanceof TagNode && destinationElement.getParentNode() instanceof Element) {
|
||||
destinationElement = (Element) destinationElement.getParentNode();
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Allow overriding of serialization for implementations. See bug #167.
|
||||
//
|
||||
protected Document createDocument(TagNode rootNode) throws ParserConfigurationException{
|
||||
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
DOMImplementation impl = builder.getDOMImplementation();
|
||||
|
||||
Document document;
|
||||
|
||||
//
|
||||
// Where a DOCTYPE is supplied in the input, ensure that this is in the output DOM. See issue #27
|
||||
//
|
||||
// Note that we may want to fix incorrect DOCTYPEs in future; there are some fairly
|
||||
// common patterns for errors with the older HTML4 doctypes.
|
||||
//
|
||||
if (rootNode.getDocType() != null){
|
||||
String qualifiedName = rootNode.getDocType().getPart1();
|
||||
String publicId = rootNode.getDocType().getPublicId();
|
||||
String systemId = rootNode.getDocType().getSystemId();
|
||||
|
||||
//
|
||||
// If there is no qualified name, set it to html. See bug #153.
|
||||
//
|
||||
if (qualifiedName == null) qualifiedName = "html";
|
||||
|
||||
DocumentType documentType = impl.createDocumentType(qualifiedName, publicId, systemId);
|
||||
|
||||
//
|
||||
// While the qualified name is "HTML" for some DocTypes, we want the actual document root name to be "html". See bug #116
|
||||
//
|
||||
if (qualifiedName.equals("HTML")) qualifiedName = "html";
|
||||
document = impl.createDocument(rootNode.getNamespaceURIOnPath(""), qualifiedName, documentType);
|
||||
} else {
|
||||
document = builder.newDocument();
|
||||
Element rootElement = document.createElement(rootNode.getName());
|
||||
document.appendChild(rootElement);
|
||||
}
|
||||
|
||||
//
|
||||
// Turn off error checking if we're allowing invalid attribute names, or if we've chosen to turn it off
|
||||
//
|
||||
if (props.isAllowInvalidAttributeNames() || strictErrorChecking == false){
|
||||
document.setStrictErrorChecking(false);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Copy across root node attributes - see issue 127. Thanks to rasifiel for the patch
|
||||
//
|
||||
Map<String, String> attributes = rootNode.getAttributes();
|
||||
Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
|
||||
while (entryIterator.hasNext()) {
|
||||
Map.Entry<String, String> entry = entryIterator.next();
|
||||
String attrName = entry.getKey();
|
||||
String attrValue = entry.getValue();
|
||||
|
||||
//
|
||||
// Fix any invalid attribute names
|
||||
//
|
||||
if (!props.isAllowInvalidAttributeNames()){
|
||||
attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
|
||||
}
|
||||
|
||||
if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())){
|
||||
|
||||
if (escapeXml) {
|
||||
attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
|
||||
attrValue = Utils.escapeXml(attrValue, props, true);
|
||||
}
|
||||
|
||||
document.getDocumentElement().setAttribute(attrName, attrValue);
|
||||
|
||||
//
|
||||
// Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
|
||||
//
|
||||
if (attrName.equalsIgnoreCase("id")) {
|
||||
document.getDocumentElement().setIdAttribute(attrName, true);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,410 @@
|
||||
/* Copyright (c) 2006-2019, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import org.w3c.dom.CDATASection;
|
||||
import org.w3c.dom.Comment;
|
||||
import org.w3c.dom.DOMImplementation;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.DocumentType;
|
||||
import org.w3c.dom.Element;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* <p>DOM serializer - creates xml DOM.</p>
|
||||
*/
|
||||
public class DomSerializer {
|
||||
|
||||
private static final String CSS_COMMENT_START = "/*";
|
||||
|
||||
private static final String CSS_COMMENT_END = "*/";
|
||||
|
||||
private static final String NEW_LINE = "\n";
|
||||
|
||||
private static final String XML_10 = "1.0";
|
||||
|
||||
private static final String XML_11 = "1.1";
|
||||
|
||||
/**
|
||||
* The HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
*/
|
||||
protected CleanerProperties props;
|
||||
|
||||
/**
|
||||
* Whether XML entities should be escaped or not.
|
||||
*/
|
||||
protected boolean escapeXml = true;
|
||||
|
||||
protected boolean deserializeCdataEntities = false;
|
||||
|
||||
protected boolean strictErrorChecking = true;
|
||||
|
||||
protected String xmlVersion = XML_10;
|
||||
|
||||
public String getXmlVersion() {
|
||||
return xmlVersion;
|
||||
}
|
||||
|
||||
public void setXmlVersion(String xmlVersion) throws Exception {
|
||||
if (xmlVersion == XML_10 || xmlVersion == XML_11) {
|
||||
this.xmlVersion = xmlVersion;
|
||||
} else {
|
||||
throw new Exception("Invalid XML version - must be 1.0 or 1.1");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
* @param escapeXml if true then escape XML entities
|
||||
* @param deserializeCdataEntities if true then deserialize entities in CData sections
|
||||
* @param strictErrorChecking if false then Document strict error checking is turned off
|
||||
*/
|
||||
public DomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking){
|
||||
this.props = props;
|
||||
this.escapeXml = escapeXml;
|
||||
this.deserializeCdataEntities = deserializeCdataEntities;
|
||||
this.strictErrorChecking = strictErrorChecking;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
* @param escapeXml if true then escape XML entities
|
||||
* @param deserializeCdataEntities if true then deserialize entities in CData sections
|
||||
*/
|
||||
public DomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities) {
|
||||
this.props = props;
|
||||
this.escapeXml = escapeXml;
|
||||
this.deserializeCdataEntities = deserializeCdataEntities;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
* @param escapeXml if true then escape XML entities
|
||||
*/
|
||||
public DomSerializer(CleanerProperties props, boolean escapeXml) {
|
||||
this.props = props;
|
||||
this.escapeXml = escapeXml;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
*/
|
||||
public DomSerializer(CleanerProperties props) {
|
||||
this(props, true);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Allow overriding of serialization for implementations. See bug #167.
|
||||
//
|
||||
protected Document createDocument(TagNode rootNode) throws ParserConfigurationException{
|
||||
|
||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||
DocumentBuilder builder = factory.newDocumentBuilder();
|
||||
DOMImplementation impl = builder.getDOMImplementation();
|
||||
|
||||
Document document;
|
||||
|
||||
//
|
||||
// Where a DOCTYPE is supplied in the input, ensure that this is in the output DOM. See issue #27
|
||||
//
|
||||
// Note that we may want to fix incorrect DOCTYPEs in future; there are some fairly
|
||||
// common patterns for errors with the older HTML4 doctypes.
|
||||
//
|
||||
if (rootNode.getDocType() != null){
|
||||
String qualifiedName = rootNode.getDocType().getPart1();
|
||||
String publicId = rootNode.getDocType().getPublicId();
|
||||
String systemId = rootNode.getDocType().getSystemId();
|
||||
|
||||
//
|
||||
// If there is no qualified name, set it to html. See bug #153.
|
||||
//
|
||||
if (qualifiedName == null) qualifiedName = "html";
|
||||
|
||||
DocumentType documentType = impl.createDocumentType(qualifiedName, publicId, systemId);
|
||||
|
||||
//
|
||||
// While the qualified name is "HTML" for some DocTypes, we want the actual document root name to be "html". See bug #116
|
||||
//
|
||||
if (qualifiedName.equals("HTML")) qualifiedName = "html";
|
||||
document = impl.createDocument(rootNode.getNamespaceURIOnPath(""), qualifiedName, documentType);
|
||||
document.setXmlVersion(xmlVersion);
|
||||
} else {
|
||||
document = builder.newDocument();
|
||||
document.setXmlVersion(xmlVersion);
|
||||
Element rootElement = document.createElement(rootNode.getName());
|
||||
document.appendChild(rootElement);
|
||||
}
|
||||
|
||||
//
|
||||
// Turn off error checking if we're allowing invalid attribute names, or if we've chosen to turn it off
|
||||
//
|
||||
if (props.isAllowInvalidAttributeNames() || strictErrorChecking == false){
|
||||
document.setStrictErrorChecking(false);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Copy across root node attributes - see issue 127. Thanks to rasifiel for the patch
|
||||
//
|
||||
Map<String, String> attributes = rootNode.getAttributes();
|
||||
Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
|
||||
while (entryIterator.hasNext()) {
|
||||
Map.Entry<String, String> entry = entryIterator.next();
|
||||
String attrName = entry.getKey();
|
||||
String attrValue = entry.getValue();
|
||||
|
||||
//
|
||||
// Fix any invalid attribute names
|
||||
//
|
||||
if (!props.isAllowInvalidAttributeNames()){
|
||||
attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
|
||||
}
|
||||
|
||||
if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())){
|
||||
|
||||
if (escapeXml) {
|
||||
attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
|
||||
attrValue = Utils.escapeXml(attrValue, props, true);
|
||||
}
|
||||
|
||||
document.getDocumentElement().setAttribute(attrName, attrValue);
|
||||
|
||||
//
|
||||
// Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
|
||||
//
|
||||
if (attrName.equalsIgnoreCase("id")) {
|
||||
document.getDocumentElement().setIdAttribute(attrName, true);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param rootNode the HTML Cleaner root node to serialize
|
||||
* @return the W3C Document object
|
||||
* @throws ParserConfigurationException if there's an error during serialization
|
||||
*/
|
||||
public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
|
||||
Document document = createDocument(rootNode);
|
||||
createSubnodes(document, (Element)document.getDocumentElement(), rootNode.getAllChildren());
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param element the element to check
|
||||
* @return true if the passed element is a script or style element
|
||||
*/
|
||||
protected boolean isScriptOrStyle(Element element) {
|
||||
String tagName = element.getNodeName();
|
||||
return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
|
||||
}
|
||||
/**
|
||||
* encapsulate content with <[CDATA[ ]]> for things like script and style elements
|
||||
* @param element
|
||||
* @return true if <[CDATA[ ]]> should be used.
|
||||
*/
|
||||
protected boolean dontEscape(Element element) {
|
||||
// make sure <script src=..></script> doesn't get turned into <script src=..><[CDATA[]]></script>
|
||||
return props.isUseCdataFor(element.getNodeName()) && (!element.hasChildNodes() || element.getTextContent() == null || element.getTextContent().trim().length() == 0);
|
||||
}
|
||||
|
||||
protected String outputCData(CData cdata){
|
||||
return cdata.getContentWithoutStartAndEndTokens();
|
||||
}
|
||||
|
||||
protected String deserializeCdataEntities(String input){
|
||||
return Utils.deserializeEntities(input, props.isRecognizeUnicodeChars());
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize a given HTML Cleaner node.
|
||||
*
|
||||
* @param document the W3C Document to use for creating new DOM elements
|
||||
* @param element the W3C element to which we'll add the subnodes to
|
||||
* @param tagChildren the HTML Cleaner nodes to serialize for that node
|
||||
*/
|
||||
protected void createSubnodes(Document document, Element element, List<? extends BaseToken> tagChildren) {
|
||||
|
||||
if (tagChildren != null) {
|
||||
|
||||
CDATASection cdata = null;
|
||||
|
||||
//
|
||||
// For script and style nodes, check if we're set to use CDATA
|
||||
//
|
||||
if (props.isUseCdataFor(element.getTagName())){
|
||||
cdata = document.createCDATASection("");
|
||||
element.appendChild(document.createTextNode(CSS_COMMENT_START));
|
||||
element.appendChild(cdata);
|
||||
}
|
||||
|
||||
Iterator<? extends BaseToken> it = tagChildren.iterator();
|
||||
while (it.hasNext()) {
|
||||
|
||||
Object item = it.next();
|
||||
if (item instanceof CommentNode) {
|
||||
|
||||
CommentNode commentNode = (CommentNode) item;
|
||||
Comment comment = document.createComment( commentNode.getContent() );
|
||||
element.appendChild(comment);
|
||||
|
||||
} else if (item instanceof ContentNode) {
|
||||
|
||||
ContentNode contentNode = (ContentNode) item;
|
||||
String content = contentNode.getContent();
|
||||
boolean specialCase = props.isUseCdataFor(element.getTagName());
|
||||
|
||||
if (props.isRecognizeUnicodeChars() && props.isTranslateSpecialEntities()) {
|
||||
content = Utils.deserializeEntities(content, props.isRecognizeUnicodeChars());
|
||||
}
|
||||
|
||||
if ((escapeXml || props.isTranslateSpecialEntities()) && !specialCase) {
|
||||
content = Utils.escapeXml(content, props, true);
|
||||
}
|
||||
|
||||
if (specialCase && item instanceof CData){
|
||||
//
|
||||
// For CDATA sections we don't want to return the start and
|
||||
// end tokens. See issue #106.
|
||||
//
|
||||
content = ((CData)item).getContentWithoutStartAndEndTokens();
|
||||
}
|
||||
|
||||
if (specialCase && deserializeCdataEntities){
|
||||
content = this.deserializeCdataEntities(content);
|
||||
}
|
||||
|
||||
if (cdata != null){
|
||||
cdata.appendData(content);
|
||||
} else {
|
||||
element.appendChild(document.createTextNode(content) );
|
||||
}
|
||||
|
||||
|
||||
} else if (item instanceof TagNode) {
|
||||
|
||||
TagNode subTagNode = (TagNode) item;
|
||||
|
||||
//
|
||||
// XML element names are more strict in their definition
|
||||
// than HTML tag identifiers.
|
||||
// See https://www.w3.org/TR/xml/#NT-Name
|
||||
// vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
//
|
||||
String name = Utils.sanitizeXmlIdentifier(subTagNode.getName(), props.getInvalidXmlAttributeNamePrefix());
|
||||
|
||||
//
|
||||
// If the element name is completely invalid, treat it as text
|
||||
//
|
||||
if (name == null){
|
||||
ContentNode contentNode = new ContentNode(subTagNode.getName() + subTagNode.getText().toString());
|
||||
String content = contentNode.getContent();
|
||||
content = Utils.escapeXml(content, props, true);
|
||||
element.appendChild(document.createTextNode(content) );
|
||||
|
||||
} else {
|
||||
|
||||
Element subelement = document.createElement( name );
|
||||
Map<String, String> attributes = subTagNode.getAttributes();
|
||||
Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
|
||||
while (entryIterator.hasNext()) {
|
||||
Map.Entry<String, String> entry = entryIterator.next();
|
||||
String attrName = entry.getKey();
|
||||
String attrValue = entry.getValue();
|
||||
if (escapeXml) {
|
||||
attrValue = Utils.deserializeEntities(attrValue, true);
|
||||
attrValue = Utils.escapeXml(attrValue, props, true);
|
||||
}
|
||||
|
||||
//
|
||||
// Fix any invalid attribute names by adding a prefix
|
||||
//
|
||||
if (!props.isAllowInvalidAttributeNames()){
|
||||
attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
|
||||
}
|
||||
|
||||
if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())){
|
||||
subelement.setAttribute(attrName, attrValue);
|
||||
|
||||
//
|
||||
// Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
|
||||
//
|
||||
if (attrName.equalsIgnoreCase("id")) {
|
||||
subelement.setIdAttribute(attrName, true);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// recursively create subnodes
|
||||
createSubnodes(document, subelement, subTagNode.getAllChildren());
|
||||
|
||||
element.appendChild(subelement);
|
||||
}
|
||||
} else if (item instanceof List) {
|
||||
List<? extends BaseToken> sublist = (List<? extends BaseToken>) item;
|
||||
createSubnodes(document, element, sublist);
|
||||
}
|
||||
|
||||
}
|
||||
if (cdata != null){
|
||||
|
||||
if (!cdata.getData().startsWith(NEW_LINE)){
|
||||
cdata.setData(CSS_COMMENT_END + NEW_LINE + cdata.getData());
|
||||
} else {
|
||||
cdata.setData(CSS_COMMENT_END + cdata.getData());
|
||||
}
|
||||
if (!cdata.getData().endsWith(NEW_LINE)){
|
||||
|
||||
cdata.appendData(NEW_LINE);
|
||||
}
|
||||
cdata.appendData(CSS_COMMENT_START);
|
||||
element.appendChild(document.createTextNode(CSS_COMMENT_END));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,69 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.Writer;
|
||||
|
||||
|
||||
/**
|
||||
* <p>HTML tag end token.</p>
|
||||
*/
|
||||
public class EndTagToken extends TagToken {
|
||||
|
||||
public EndTagToken() {
|
||||
}
|
||||
|
||||
public EndTagToken(String name) {
|
||||
super(name == null ? null : name);
|
||||
}
|
||||
|
||||
@Override
|
||||
void addAttribute(String attName, String attValue) {
|
||||
// do nothing - simply ignore attributes in closing tag
|
||||
}
|
||||
|
||||
public void serialize(Serializer serializer, Writer writer) {
|
||||
// do nothing - simply ignore serialization
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "endtoken" + super.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,535 @@
|
||||
/* Copyright (c) 2006-2015, Philokypros Ioulianou
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Philokypros Ioulianou by sending e-mail to
|
||||
philokypro_s@hotmail.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
public class Html4TagProvider implements ITagInfoProvider {
|
||||
|
||||
private static final String STRONG = "strong";
|
||||
private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
|
||||
// singleton instance, used if no other TagInfoProvider is specified
|
||||
public final static Html4TagProvider INSTANCE= new Html4TagProvider();
|
||||
|
||||
private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo,"+STRONG+",em,q,b,i,u,tt,sub,sup,big,small,strike,s,font";
|
||||
private static final String CLOSE_BEFORE_TAGS = "p,details,summary,menuitem,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
|
||||
|
||||
/**
|
||||
* Phrasing tags are those that can make up paragraphs along with text to make Phrasing Content
|
||||
*/
|
||||
private static final String PHRASING_TAGS = "a,abbr,area,b,bdi,bdo,br,button,canvas,cite,code,command,data,datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,keygen,label,link,map,mark,math,meta,meter,noscript,object,output,progress,q,s,samp,script,select,small,span,strong,sub,sup,svg,template,text,textarea,time,u,var,wbr";
|
||||
|
||||
public Html4TagProvider() {
|
||||
|
||||
TagInfo tagInfo=null;
|
||||
basicElements(tagInfo);
|
||||
formattingElements(tagInfo);
|
||||
formElements(tagInfo);
|
||||
imgElements(tagInfo);
|
||||
listElements(tagInfo);
|
||||
linkElements(tagInfo);
|
||||
tableElements(tagInfo);
|
||||
styleElements(tagInfo);
|
||||
olderElements(tagInfo);
|
||||
scriptElements(tagInfo);
|
||||
}
|
||||
|
||||
public void basicElements(TagInfo tagInfo){
|
||||
|
||||
tagInfo = new TagInfo("title", ContentType.text, BelongsTo.HEAD, false, true, false, CloseTag.required, Display.none);
|
||||
this.put("title", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h1", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h1", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h2", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h2", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h3", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h3", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h4", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h4", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h5", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h5", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h6", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("h6", tagInfo);
|
||||
|
||||
// jericho parser requires <p></p>
|
||||
tagInfo = new TagInfo("p", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("p", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("br", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("br", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("hr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("hr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("div", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("div", tagInfo);
|
||||
}
|
||||
|
||||
|
||||
public void formattingElements(TagInfo tagInfo){
|
||||
|
||||
tagInfo = new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("abbr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("acronym", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("acronym", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("address", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("address", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("b", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("u,i,tt,sub,sup,big,small,strike,blink,s");
|
||||
this.put("b", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("bdo", tagInfo);
|
||||
|
||||
|
||||
tagInfo = new TagInfo("blockquote", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("blockquote", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("cite", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("cite", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("q", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("q", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("code", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("code", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("ins", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("ins", tagInfo);
|
||||
|
||||
|
||||
tagInfo = new TagInfo("i", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,tt,sub,sup,big,small,strike,blink,s");
|
||||
this.put("i", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("u", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,i,tt,sub,sup,big,small,strike,blink,s");
|
||||
this.put("u", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,big,small,strike,blink,s");
|
||||
this.put("tt", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sup,big,small,strike,blink,s");
|
||||
this.put("sub", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,big,small,strike,blink,s");
|
||||
this.put("sup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("big", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,small,strike,blink,s");
|
||||
this.put("big", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("small", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,strike,blink,s");
|
||||
this.put("small", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("strike", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,blink,s");
|
||||
this.put("strike", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("blink", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,s");
|
||||
this.put("blink", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("marquee", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("marquee", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("s", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,blink");
|
||||
this.put("s", tagInfo);
|
||||
|
||||
|
||||
tagInfo = new TagInfo("font", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
|
||||
this.put("font", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("basefont", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("basefont", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("center", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("center", tagInfo);
|
||||
|
||||
|
||||
tagInfo = new TagInfo("del", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("del", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("dfn", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("kbd", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("pre", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("pre", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("samp", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("samp", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put(STRONG, tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("em", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("em", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("var", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("var", tagInfo);
|
||||
|
||||
|
||||
tagInfo = new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("wbr", tagInfo);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void formElements(TagInfo tagInfo){
|
||||
tagInfo = new TagInfo("form", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.block);
|
||||
tagInfo.defineForbiddenTags("form");
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("form", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("input", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("input", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("textarea", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("textarea", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("select", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags("option,optgroup");
|
||||
tagInfo.defineCloseBeforeTags("option,optgroup,select");
|
||||
this.put("select", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("option", ContentType.text, BelongsTo.BODY, false, false, true, CloseTag.optional, Display.inline);
|
||||
tagInfo.defineFatalTags("select");
|
||||
tagInfo.defineCloseBeforeTags("option");
|
||||
this.put("option", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("optgroup", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
|
||||
tagInfo.defineFatalTags("select");
|
||||
tagInfo.defineAllowedChildrenTags("option");
|
||||
tagInfo.defineCloseBeforeTags("optgroup");
|
||||
this.put("optgroup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("button", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("button", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("label", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("label", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("legend", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
//
|
||||
// If we include this rule, we get an out-of-memory error. See issue 129.
|
||||
//
|
||||
//tagInfo.defineRequiredEnclosingTags("fieldset");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("legend", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("fieldset", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("fieldset", tagInfo);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void listElements(TagInfo tagInfo){
|
||||
|
||||
tagInfo = new TagInfo("ul", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("ul", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("ol", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("ol", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("li", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("li,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("li", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dl", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("dl", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeTags("dt,dd");
|
||||
this.put("dt", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeTags("dt,dd");
|
||||
this.put("dd", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("menu", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("menu", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dir", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("dir", tagInfo);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void linkElements(TagInfo tagInfo){
|
||||
|
||||
tagInfo = new TagInfo("link", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("link", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("a", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("a");
|
||||
this.put("a", tagInfo);
|
||||
}
|
||||
|
||||
|
||||
public void tableElements(TagInfo tagInfo){
|
||||
|
||||
tagInfo = new TagInfo("table", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,colgroup,caption");
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("table", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tbody");
|
||||
tagInfo.defineAllowedChildrenTags("td,th");
|
||||
tagInfo.defineHigherLevelTags("thead,tfoot");
|
||||
tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup");
|
||||
this.put("tr", tagInfo);
|
||||
|
||||
// jericho parser requires <td></td>
|
||||
tagInfo = new TagInfo("td", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tr");
|
||||
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
|
||||
this.put("td", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("th", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tr");
|
||||
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
|
||||
this.put("th", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tbody", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("tbody", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("thead", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("thead", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tfoot", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("tfoot", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("col", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineFatalTags("colgroup");
|
||||
this.put("col", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("col");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("colgroup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("caption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("caption", tagInfo);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void styleElements(TagInfo tagInfo){
|
||||
|
||||
tagInfo = new TagInfo("span", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("span", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("style", ContentType.text, BelongsTo.HEAD, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("style", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("bgsound", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("bgsound", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("meta", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("meta", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("base", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("base", tagInfo);
|
||||
}
|
||||
|
||||
|
||||
public void scriptElements(TagInfo tagInfo){
|
||||
|
||||
tagInfo = new TagInfo("script", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("script", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("noscript", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.block);
|
||||
this.put("noscript", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("applet", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.any);
|
||||
this.put("applet", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("object", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("object", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("param", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("param", tagInfo);
|
||||
}
|
||||
|
||||
|
||||
public void imgElements(TagInfo tagInfo){
|
||||
tagInfo = new TagInfo("img", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
|
||||
this.put("img", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("area", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
|
||||
tagInfo.defineFatalTags("map");
|
||||
tagInfo.defineCloseBeforeTags("area");
|
||||
this.put("area", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("map", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseBeforeTags("map");
|
||||
this.put("map", tagInfo);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void olderElements(TagInfo tagInfo){
|
||||
tagInfo = new TagInfo("listing", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("listing", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("nobr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("nobr");
|
||||
this.put("nobr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("xmp", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
this.put("xmp", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("xml", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("xml", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("isindex", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("isindex", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("comment", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("comment", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("server", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
|
||||
this.put("server", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
|
||||
this.put("iframe", tagInfo);
|
||||
|
||||
}
|
||||
|
||||
|
||||
protected void put(String tagName, TagInfo tagInfo) {
|
||||
this.tagInfoMap.put(tagName, tagInfo);
|
||||
}
|
||||
|
||||
public TagInfo getTagInfo(String tagName) {
|
||||
if ( tagName == null) {
|
||||
// null named tagNode happens when a html fragment is being dealt with
|
||||
return null;
|
||||
} else {
|
||||
return this.tagInfoMap.get(tagName.toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,885 @@
|
||||
/* Copyright (c) 2006-2017, Philokypros Ioulianou and the HTMLCleaner team
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Philokypros Ioulianou by sending e-mail to
|
||||
philokypro_s@hotmail.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
public class Html5TagProvider implements ITagInfoProvider {
|
||||
|
||||
// Tag name reused when building CLOSE_BEFORE_COPY_INSIDE_TAGS below.
private static final String STRONG = "strong";
|
||||
// Tag definitions keyed by tag name; presumably filled through put(...) - confirm against the rest of the class.
private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
|
||||
// singleton instance, used if no other TagInfoProvider is specified
|
||||
// NOTE: this instance is constructed before the String constants below are
// textually reached. That is harmless here because they are all compile-time
// constant Strings, which the compiler inlines at their use sites.
public final static Html5TagProvider INSTANCE = new Html5TagProvider();
|
||||
// NOTE(review): the only assignment visible for this field is commented out in
// the constructor, so it appears to stay null unless set elsewhere - confirm.
public MathMLTagProvider INSTANCE2;
|
||||
|
||||
// Comma-separated tag list passed to TagInfo.defineCloseBeforeCopyInsideTags(...).
private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo," + STRONG
|
||||
+ ",em,q,b,i,sub,sup,small,s";
|
||||
// Default comma-separated tag list passed to TagInfo.defineCloseBeforeTags(...).
private static final String CLOSE_BEFORE_TAGS = "p,summary,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
|
||||
|
||||
// Earlier variant of the list that also contained h1-h6, kept for reference:
// private static final String CLOSE_BEFORE_TAGS =
|
||||
// "h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
|
||||
|
||||
/**
|
||||
* Phrasing tags are those that can make up paragraphs along with text to
|
||||
* make Phrasing Content. Generally speaking, phrasing content only allows phrasing content as child tags.
|
||||
*/
|
||||
private static final String PHRASING_TAGS = "a,abbr,area,audio,b,bdi,bdo,br,button,canvas,cite,code,command,datalist,del,dfn,em,i,input,ins,kbd,keygen,label,link,map,mark,meta,meter,noscript,output,progress,p,ruby,samp,s,script,select,small,span,strong,sub,sup,svg,template,textarea,time,u,var,wbr";
|
||||
|
||||
/**
|
||||
* Most elements that are used in the body of documents and applications are categorized as flow content.
|
||||
*/
|
||||
private static final String FLOW_TAGS = "a,abbr,address,area,article,aside,audio,b,bdi,bdo,blockquote,br,button,canvas,cite,code,data,datalist,del,dfn,div,dl,em,embed,fieldset,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hr,i,iframe,img,input,ins,kbd,keygen,label,main,map,mark,math,meter,nav,noscript,object,ol,output,p,pre,progress,q,ruby,s,samp,script,section,select,small,span,strong,sub,sup,svg,table,template,textarea,time,u,ul,var,video,wbr,text";
|
||||
|
||||
/**
|
||||
* HTML5 Media Tags
|
||||
*/
|
||||
private static final String MEDIA_TAGS = "audio,video,object,source";
|
||||
|
||||
// Comma-separated list of the script-supporting tags (script and template).
private static final String SCRIPT_SUPPORTING_TAGS = "script,template";
|
||||
|
||||
public Html5TagProvider() {
|
||||
TagInfo tagInfo = null;
|
||||
|
||||
embeddedContentTags(tagInfo);
|
||||
semanticFlowTags(tagInfo);
|
||||
interactiveTags(tagInfo);
|
||||
groupingTags(tagInfo);
|
||||
phrasingTags(tagInfo);
|
||||
mediaTags(tagInfo);
|
||||
editTags(tagInfo);
|
||||
formTags(tagInfo);
|
||||
tableTags(tagInfo);
|
||||
metadataTags(tagInfo);
|
||||
scriptingTags(tagInfo);
|
||||
//INSTANCE2 = new MathMLTagProvider(tagInfo, tagInfoMap);
|
||||
}
|
||||
|
||||
public void embeddedContentTags(TagInfo tagInfo) {
|
||||
|
||||
// SVG
|
||||
tagInfo = new TagInfo("svg", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineAllowedChildrenTags("animate,animateMotion,animateTransform,discard,set,desc,title,metadata,linearGradient,radialGradient,pattern,circle,ellipse,line,path,polygon,polyline,rect,defs,g,svg,symbol,use,a,audio,canvas,clipPath,filter,foreignObject,iframe,image,marker,mask,script,style,switch,text,video,view");
|
||||
tagInfo.setAssumedNamespace("http://www.w3.org/2000/svg");
|
||||
tagInfo.setAssumedNamespacePrefix("svg");
|
||||
this.put("svg", tagInfo);
|
||||
|
||||
// MathML
|
||||
tagInfo = new TagInfo("math", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("math,summary,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
//tagInfo.defineForbiddenTags("math");
|
||||
//
|
||||
// We'll add this later - right now it causes more problems than it solves
|
||||
// as there are no tag name clashes between MathML and HTML unlike in SVG.
|
||||
//
|
||||
tagInfo.setAssumedNamespace("http://www.w3.org/1998/Math/MathML");
|
||||
tagInfo.setAssumedNamespacePrefix("mathml");
|
||||
//
|
||||
this.put("math", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* The HTML5 semantic flow tags-Sectioning tags (15 total)
|
||||
*
|
||||
*/
|
||||
public void semanticFlowTags(TagInfo tagInfo) {
|
||||
|
||||
tagInfo = new TagInfo("section", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("section", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("nav", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("nav", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("article", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineForbiddenTags("menu");
|
||||
this.put("article", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("aside", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineForbiddenTags("menu");
|
||||
tagInfo.defineForbiddenTags("address");
|
||||
this.put("aside", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h1", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
|
||||
this.put("h1", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h2", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
|
||||
this.put("h2", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h3", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
|
||||
this.put("h3", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h4", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
|
||||
this.put("h4", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h5", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
|
||||
this.put("h5", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("h6", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
|
||||
this.put("h6", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("hgroup", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineAllowedChildrenTags("h1,h2,h3,h4,h5,h6");
|
||||
this.put("hgroup", tagInfo);
|
||||
|
||||
// header and footer
|
||||
tagInfo = new TagInfo("header", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineForbiddenTags("menu,header,footer");
|
||||
this.put("header", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("footer", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineForbiddenTags("menu,header,footer");
|
||||
this.put("footer", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("main", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("main", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("address", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineForbiddenTags("address");
|
||||
this.put("address", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* The HTML5 interactive tags (6 registered here: details, summary, command, menu, menuitem, dialog)
|
||||
*/
|
||||
public void interactiveTags(TagInfo tagInfo) {
|
||||
|
||||
tagInfo = new TagInfo("details", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("details", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("summary", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineRequiredEnclosingTags("details");
|
||||
tagInfo.defineForbiddenTags("summary");
|
||||
this.put("summary", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("command", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineForbiddenTags("command");
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("command", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("menu", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineAllowedChildrenTags("menuitem,li");
|
||||
this.put("menu", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("menuitem", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineRequiredEnclosingTags("menu");
|
||||
this.put("menuitem", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dialog", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("dialog", tagInfo);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* The HTML5 grouping tags (13 registered here)
|
||||
*/
|
||||
|
||||
public void groupingTags(TagInfo tagInfo) {
|
||||
|
||||
tagInfo = new TagInfo("div", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("div", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("figure", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("figure", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("figcaption", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineRequiredEnclosingTags("figure");
|
||||
this.put("figcaption", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("p", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,summary,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml,time");
|
||||
this.put("p", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("pre", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("pre", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("ul", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("dl,"+CLOSE_BEFORE_TAGS);
|
||||
//
|
||||
// This is not correct, but is how most browsers seem to handle
|
||||
// lists. Strictly, only an LI can be a child of a UL or OL
|
||||
//
|
||||
tagInfo.defineAllowedChildrenTags("li,ul,ol,div");
|
||||
//
|
||||
// Where we do have invalid children, we try to insert a LI to make it valid
|
||||
// rather than move out the content.
|
||||
//
|
||||
tagInfo.setPreferredChildTag("li");
|
||||
this.put("ul", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("ol", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("dl,"+CLOSE_BEFORE_TAGS);
|
||||
//
|
||||
// This is not correct, but is how most browsers seem to handle
|
||||
// lists. Strictly, only an LI can be a child of a UL or OL
|
||||
//
|
||||
tagInfo.defineAllowedChildrenTags("li,ul,ol,div");
|
||||
//
|
||||
// Where we do have invalid children, we try to insert a LI to make it valid
|
||||
// rather than move out the content.
|
||||
//
|
||||
tagInfo.setPreferredChildTag("li");
|
||||
this.put("ol", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("li", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("li," + CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineRequiredEnclosingTags("ol,menu,ul");
|
||||
this.put("li", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dl", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineAllowedChildrenTags("dt,dd,div,"+SCRIPT_SUPPORTING_TAGS);
|
||||
tagInfo.setPreferredChildTag("div");
|
||||
this.put("dl", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeTags("dt,dd");
|
||||
tagInfo.defineAllowedChildrenTags(FLOW_TAGS);
|
||||
tagInfo.defineRequiredEnclosingTags("dl");
|
||||
this.put("dt", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeTags("dt,dd");
|
||||
tagInfo.defineAllowedChildrenTags(FLOW_TAGS);
|
||||
tagInfo.defineRequiredEnclosingTags("dl");
|
||||
this.put("dd", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("hr", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("hr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("blockquote", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("blockquote", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* HTML5 phrasing tags -- text-level semantics (31 total, including the data tag)
|
||||
*/
|
||||
public void phrasingTags(TagInfo tagInfo) {
|
||||
|
||||
tagInfo = new TagInfo("em", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("em", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put(STRONG, tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("small", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,blink,s");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("small", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("s", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,small,blink");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("s", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("a", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("a");
|
||||
this.put("a", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.none);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("wbr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("mark", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("mark", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("bdi", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("bdi", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("time", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("time", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("data", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("data", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("cite", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("cite", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("q", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("q", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("code", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("code", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("span", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
this.put("span", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("bdo", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("dfn", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("kbd", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("abbr", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("var", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("var", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("samp", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("samp", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("br", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("br", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sup,small,blink,s");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("sub", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,small,blink,s");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("sup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("b", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("u,i,sub,sup,small,blink,s");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("b", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("i", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,u,sub,sup,small,blink,s");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("i", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("u", ContentType.all, BelongsTo.BODY, true,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseInsideCopyAfterTags("b,i,sub,sup,small,blink,s");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("u", tagInfo);
|
||||
|
||||
// ---->Html5 Ruby text (added rb,rtc)
|
||||
|
||||
tagInfo = new TagInfo("ruby", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags("rt,rp,rb,rtc");
|
||||
this.put("ruby", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("rtc", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.inline);
|
||||
tagInfo.defineRequiredEnclosingTags("ruby");
|
||||
tagInfo.defineAllowedChildrenTags("rt,"+PHRASING_TAGS);
|
||||
this.put("rtc", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("rb", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.inline);
|
||||
tagInfo.defineRequiredEnclosingTags("ruby");
|
||||
this.put("rb", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("rt", ContentType.text, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.inline);
|
||||
tagInfo.defineRequiredEnclosingTags("ruby");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("rt", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("rp", ContentType.text, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.inline);
|
||||
tagInfo.defineRequiredEnclosingTags("ruby");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("rp", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* HTML5 media and embedded-content tags (13 registered here)
|
||||
*/
|
||||
public void mediaTags(TagInfo tagInfo) {
|
||||
|
||||
tagInfo = new TagInfo("img", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.inline);
|
||||
this.put("img", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
this.put("iframe", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("embed", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("embed", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("object", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
this.put("object", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("param", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.none);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineRequiredEnclosingTags("object");
|
||||
this.put("param", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("audio", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
|
||||
this.put("audio", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("picture", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
|
||||
this.put("picture", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("video", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
|
||||
this.put("video", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("source", ContentType.none, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.forbidden, Display.any);
|
||||
tagInfo.defineRequiredEnclosingTags("audio,video,object");
|
||||
this.put("source", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("track", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.any);
|
||||
tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
|
||||
this.put("track", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("canvas", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
this.put("canvas", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("area", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.none);
|
||||
tagInfo.defineFatalTags("map");
|
||||
tagInfo.defineCloseBeforeTags("area");
|
||||
this.put("area", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("map", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseBeforeTags("map");
|
||||
tagInfo.defineAllowedChildrenTags("area");
|
||||
this.put("map", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* The HTML5 edits tags (2 total)
|
||||
*/
|
||||
public void editTags(TagInfo tagInfo) {
|
||||
tagInfo = new TagInfo("ins", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
this.put("ins", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("del", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
this.put("del", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* The HTML5 table tags (10 registered here)
|
||||
*/
|
||||
public void tableTags(TagInfo tagInfo) {
|
||||
|
||||
tagInfo = new TagInfo("table", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,col,colgroup,caption");
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("table", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tbody");
|
||||
tagInfo.defineAllowedChildrenTags("td,th");
|
||||
//
|
||||
// Where we do have invalid children, we try to insert a TD to make it valid
|
||||
// rather than move out the content.
|
||||
//
|
||||
tagInfo.setPreferredChildTag("td");
|
||||
tagInfo.defineHigherLevelTags("thead,tfoot");
|
||||
tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup");
|
||||
this.put("tr", tagInfo);
|
||||
|
||||
// jericho parser requires <td></td>
|
||||
tagInfo = new TagInfo("td", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tr");
|
||||
tagInfo.defineHigherLevelTags("tr");
|
||||
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
|
||||
this.put("td", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("th", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineRequiredEnclosingTags("tr");
|
||||
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
|
||||
this.put("th", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tbody", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("tbody", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("thead", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("thead", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("tfoot", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("tr,form");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("tfoot", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("col", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.block);
|
||||
tagInfo.defineFatalTags("colgroup");
|
||||
this.put("col", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineAllowedChildrenTags("col");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("colgroup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("caption", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineFatalTags("table");
|
||||
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
|
||||
this.put("caption", tagInfo);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* The HTML5 forms tags (15 total)
|
||||
*
|
||||
*/
|
||||
public void formTags(TagInfo tagInfo) {
|
||||
|
||||
tagInfo = new TagInfo("meter", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("meter");
|
||||
this.put("meter", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("form", ContentType.all, BelongsTo.BODY, false,
|
||||
false, true, CloseTag.required, Display.block);
|
||||
tagInfo.defineForbiddenTags("form");
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("form", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("input", ContentType.none, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("input", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("textarea", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("textarea", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("select", ContentType.all, BelongsTo.BODY, false,
|
||||
false, true, CloseTag.required, Display.inline);
|
||||
tagInfo.defineAllowedChildrenTags("option,optgroup");
|
||||
tagInfo.defineCloseBeforeTags("option,optgroup,select");
|
||||
this.put("select", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("option", ContentType.text, BelongsTo.BODY,
|
||||
false, false, true, CloseTag.optional, Display.inline);
|
||||
tagInfo.defineFatalTags("select,datalist");
|
||||
tagInfo.defineCloseBeforeTags("option");
|
||||
this.put("option", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("optgroup", ContentType.all, BelongsTo.BODY,
|
||||
false, false, true, CloseTag.required, Display.inline);
|
||||
tagInfo.defineFatalTags("select");
|
||||
tagInfo.defineAllowedChildrenTags("option");
|
||||
tagInfo.defineCloseBeforeTags("optgroup");
|
||||
this.put("optgroup", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("button", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseBeforeTags("select,optgroup,option");
|
||||
this.put("button", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("label", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.inline);
|
||||
this.put("label", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("legend", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineRequiredEnclosingTags("fieldset");
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
this.put("legend", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("fieldset", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
|
||||
this.put("fieldset", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("progress", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
|
||||
tagInfo.defineCloseBeforeTags("progress");
|
||||
this.put("progress", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("datalist", ContentType.all, BelongsTo.BODY,
|
||||
false, false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineAllowedChildrenTags("option");
|
||||
tagInfo.defineCloseBeforeTags("datalist");
|
||||
this.put("datalist", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("keygen", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.any);
|
||||
this.put("keygen", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("output", ContentType.all, BelongsTo.BODY, false,
|
||||
false, false, CloseTag.required, Display.any);
|
||||
tagInfo.defineCloseBeforeTags("output," + CLOSE_BEFORE_TAGS);
|
||||
this.put("output", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* HTML5 Document metadata tags
|
||||
*/
|
||||
public void metadataTags(TagInfo tagInfo) {
|
||||
|
||||
// As of HTML5, meta can be used in <body> where it has a @name attribute
|
||||
// TODO add attribute rules
|
||||
tagInfo = new TagInfo("meta", ContentType.none, BelongsTo.HEAD_AND_BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("meta", tagInfo);
|
||||
// As of HTML5, link can be used in <body> where it has an @itemprop attribute
|
||||
// TODO add attribute rules
|
||||
tagInfo = new TagInfo("link", ContentType.none, BelongsTo.HEAD_AND_BODY, false,
|
||||
false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("link", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("title", ContentType.text, BelongsTo.HEAD, false,
|
||||
true, false, CloseTag.required, Display.none);
|
||||
this.put("title", tagInfo);
|
||||
|
||||
// Current specification: style can only be used in <head>
|
||||
tagInfo = new TagInfo("style", ContentType.text, BelongsTo.HEAD, false,
|
||||
false, false, CloseTag.required, Display.none);
|
||||
this.put("style", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("base", ContentType.none, BelongsTo.HEAD, false,
|
||||
false, false, CloseTag.forbidden, Display.none);
|
||||
this.put("base", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* HTML5 scripting tags
|
||||
*/
|
||||
public void scriptingTags(TagInfo tagInfo) {
|
||||
tagInfo = new TagInfo("script", ContentType.all,
|
||||
BelongsTo.HEAD_AND_BODY, false, false, false,
|
||||
CloseTag.required, Display.none);
|
||||
this.put("script", tagInfo);
|
||||
|
||||
tagInfo = new TagInfo("noscript", ContentType.all,
|
||||
BelongsTo.HEAD_AND_BODY, false, false, false,
|
||||
CloseTag.required, Display.block);
|
||||
this.put("noscript", tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* It inserts the tag node into the tagInfoMap.
|
||||
*
|
||||
* @param tagName
|
||||
* The name of the tag
|
||||
* @param tagInfo
|
||||
* The info about tag node
|
||||
*/
|
||||
protected void put(String tagName, TagInfo tagInfo) {
|
||||
this.tagInfoMap.put(tagName, tagInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* It returns the tag information.
|
||||
*
|
||||
* @param tagName
|
||||
* The name of the tag to return
|
||||
* @return TagInfo The information about tag node
|
||||
*/
|
||||
public TagInfo getTagInfo(String tagName) {
|
||||
if (tagName == null) {
|
||||
// null named tagNode happens when a html fragment is being dealt
|
||||
// with
|
||||
return null;
|
||||
} else {
|
||||
return this.tagInfoMap.get(tagName.toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,62 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
|
||||
/**
 * <p>General HtmlCleaner runtime exception.</p>
 *
 * <p>Unchecked, so callers are not forced to declare or catch it.</p>
 */
public class HtmlCleanerException extends RuntimeException {

    /** Message used by the no-argument constructor. */
    private static final String DEFAULT_MESSAGE = "HtmlCleaner exception occurred!";

    /**
     * Creates an exception carrying the generic default message and no cause.
     */
    public HtmlCleanerException() {
        this(DEFAULT_MESSAGE);
    }

    /**
     * Creates an exception wrapping the given cause.
     *
     * @param cause the underlying failure
     */
    public HtmlCleanerException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with the given message and no cause.
     *
     * @param message description of the failure
     */
    public HtmlCleanerException(String message) {
        super(message);
    }

    /**
     * Creates an exception with the given message and cause.
     *
     * @param message description of the failure
     * @param cause   the underlying failure
     */
    public HtmlCleanerException(String message, Throwable cause) {
        super(message, cause);
    }

}
|
||||
@@ -0,0 +1,354 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import org.apache.tools.ant.BuildException;
|
||||
import org.apache.tools.ant.Task;
|
||||
|
||||
import java.net.URL;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* <p>Support for ANT.</p>
|
||||
*/
|
||||
public class HtmlCleanerForAnt extends Task {
|
||||
|
||||
private String text;
|
||||
private String src;
|
||||
private String dest;
|
||||
private String incharset = CleanerProperties.DEFAULT_CHARSET;
|
||||
private String outcharset = CleanerProperties.DEFAULT_CHARSET;
|
||||
private String taginfofile = null;
|
||||
private String outputtype = "simple";
|
||||
private boolean advancedxmlescape = true;
|
||||
private boolean usecdata = true;
|
||||
private String usecdatafor = "script,style";
|
||||
private boolean specialentities = true;
|
||||
private boolean unicodechars = true;
|
||||
private boolean omitunknowntags = false;
|
||||
private boolean treatunknowntagsascontent = false;
|
||||
private boolean omitdeprtags = false;
|
||||
private boolean treatdeprtagsascontent = false;
|
||||
private boolean omitcomments = false;
|
||||
private boolean omitxmldecl = false;
|
||||
private boolean omitdoctypedecl = true;
|
||||
private boolean omithtmlenvelope = false;
|
||||
private boolean useemptyelementtags = true;
|
||||
private boolean allowmultiwordattributes = true;
|
||||
private boolean allowhtmlinsideattributes = false;
|
||||
private boolean ignoreqe = false;
|
||||
private boolean namespacesaware = true;
|
||||
private String hyphenreplacement = "=";
|
||||
private String prunetags = "";
|
||||
private String booleanatts = CleanerProperties.BOOL_ATT_SELF;
|
||||
private String nodebyxpath = null;
|
||||
|
||||
private String transform = null;
|
||||
|
||||
private boolean allowInvalidAttributeNames = false;
|
||||
private String invalidAttributeNamePrefix = "";
|
||||
|
||||
public void setText(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
public void setSrc(String src) {
|
||||
this.src = src;
|
||||
}
|
||||
|
||||
public void setDest(String dest) {
|
||||
this.dest = dest;
|
||||
}
|
||||
|
||||
public void setIncharset(String incharset) {
|
||||
this.incharset = incharset;
|
||||
}
|
||||
|
||||
public void setOutcharset(String outcharset) {
|
||||
this.outcharset = outcharset;
|
||||
}
|
||||
|
||||
public void setTaginfofile(String taginfofile) {
|
||||
this.taginfofile = taginfofile;
|
||||
}
|
||||
|
||||
public void setOutputtype(String outputtype) {
|
||||
this.outputtype = outputtype;
|
||||
}
|
||||
|
||||
public void setAdvancedxmlescape(boolean advancedxmlescape) {
|
||||
this.advancedxmlescape = advancedxmlescape;
|
||||
}
|
||||
|
||||
public void setUsecdata(boolean usecdata) {
|
||||
this.usecdata = usecdata;
|
||||
}
|
||||
|
||||
public void setUsecdatafor(String usecdatafor) {
|
||||
this.usecdatafor = usecdatafor;
|
||||
}
|
||||
|
||||
public void setSpecialentities(boolean specialentities) {
|
||||
this.specialentities = specialentities;
|
||||
}
|
||||
|
||||
public void setUnicodechars(boolean unicodechars) {
|
||||
this.unicodechars = unicodechars;
|
||||
}
|
||||
|
||||
public void setOmitunknowntags(boolean omitunknowntags) {
|
||||
this.omitunknowntags = omitunknowntags;
|
||||
}
|
||||
|
||||
public void setTreatunknowntagsascontent(boolean treatunknowntagsascontent) {
|
||||
this.treatunknowntagsascontent = treatunknowntagsascontent;
|
||||
}
|
||||
|
||||
public void setOmitdeprtags(boolean omitdeprtags) {
|
||||
this.omitdeprtags = omitdeprtags;
|
||||
}
|
||||
|
||||
|
||||
public void setTreatdeprtagsascontent(boolean treatdeprtagsascontent) {
|
||||
this.treatdeprtagsascontent = treatdeprtagsascontent;
|
||||
}
|
||||
|
||||
public void setOmitcomments(boolean omitcomments) {
|
||||
this.omitcomments = omitcomments;
|
||||
}
|
||||
|
||||
public void setOmitxmldecl(boolean omitxmldecl) {
|
||||
this.omitxmldecl = omitxmldecl;
|
||||
}
|
||||
|
||||
public void setOmitdoctypedecl(boolean omitdoctypedecl) {
|
||||
this.omitdoctypedecl = omitdoctypedecl;
|
||||
}
|
||||
|
||||
public void setOmithtmlenvelope(boolean omithtmlenvelope) {
|
||||
this.omithtmlenvelope = omithtmlenvelope;
|
||||
}
|
||||
|
||||
public void setUseemptyelementtags(boolean useemptyelementtags) {
|
||||
this.useemptyelementtags = useemptyelementtags;
|
||||
}
|
||||
|
||||
public void setAllowmultiwordattributes(boolean allowmultiwordattributes) {
|
||||
this.allowmultiwordattributes = allowmultiwordattributes;
|
||||
}
|
||||
|
||||
public void setAllowhtmlinsideattributes(boolean allowhtmlinsideattributes) {
|
||||
this.allowhtmlinsideattributes = allowhtmlinsideattributes;
|
||||
}
|
||||
|
||||
public void setIgnoreqe(boolean ignoreqe) {
|
||||
this.ignoreqe = ignoreqe;
|
||||
}
|
||||
|
||||
public void setNamespacesaware(boolean namespacesaware) {
|
||||
this.namespacesaware = namespacesaware;
|
||||
}
|
||||
|
||||
public void setHyphenreplacement(String hyphenreplacement) {
|
||||
this.hyphenreplacement = hyphenreplacement;
|
||||
}
|
||||
|
||||
public void setPrunetags(String prunetags) {
|
||||
this.prunetags = prunetags;
|
||||
}
|
||||
|
||||
public void setBooleanatts(String booleanatts) {
|
||||
this.booleanatts = booleanatts;
|
||||
}
|
||||
|
||||
public void setNodebyxpath(String nodebyxpath) {
|
||||
this.nodebyxpath = nodebyxpath;
|
||||
}
|
||||
|
||||
public void setTransform(String transform) {
|
||||
this.transform = transform;
|
||||
}
|
||||
|
||||
public void addText(String text) {
|
||||
this.text = text;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of Ant task execution.
|
||||
* @throws BuildException
|
||||
*/
|
||||
@Override
|
||||
public void execute() throws BuildException {
|
||||
HtmlCleaner cleaner;
|
||||
|
||||
if ( this.taginfofile != null ) {
|
||||
cleaner = new HtmlCleaner(new ConfigFileTagProvider(new File(this.taginfofile)));
|
||||
} else {
|
||||
cleaner = new HtmlCleaner();
|
||||
}
|
||||
|
||||
if (text == null && src == null) {
|
||||
throw new BuildException("Eather attribute 'src' or text body containing HTML must be specified!");
|
||||
}
|
||||
|
||||
CleanerProperties props = cleaner.getProperties();
|
||||
|
||||
props.setAdvancedXmlEscape(this.advancedxmlescape);
|
||||
props.setUseCdataFor(this.usecdatafor);
|
||||
props.setUseCdataForScriptAndStyle(this.usecdata);
|
||||
props.setTranslateSpecialEntities(this.specialentities);
|
||||
props.setRecognizeUnicodeChars(this.unicodechars);
|
||||
props.setOmitUnknownTags(this.omitunknowntags);
|
||||
props.setTreatUnknownTagsAsContent(this.treatunknowntagsascontent);
|
||||
props.setOmitDeprecatedTags(this.omitdeprtags);
|
||||
props.setTreatDeprecatedTagsAsContent(this.treatdeprtagsascontent);
|
||||
props.setOmitComments(this.omitcomments);
|
||||
props.setOmitXmlDeclaration(this.omitxmldecl);
|
||||
props.setOmitDoctypeDeclaration(this.omitdoctypedecl);
|
||||
props.setOmitHtmlEnvelope(this.omithtmlenvelope);
|
||||
props.setUseEmptyElementTags(this.useemptyelementtags);
|
||||
props.setAllowMultiWordAttributes(this.allowmultiwordattributes);
|
||||
props.setAllowHtmlInsideAttributes(this.allowhtmlinsideattributes);
|
||||
props.setIgnoreQuestAndExclam(this.ignoreqe);
|
||||
props.setNamespacesAware(this.namespacesaware);
|
||||
props.setHyphenReplacementInComment(this.hyphenreplacement);
|
||||
props.setPruneTags(this.prunetags);
|
||||
props.setBooleanAttributeValues(this.booleanatts);
|
||||
props.setAllowInvalidAttributeNames(this.allowInvalidAttributeNames);
|
||||
props.setInvalidXmlAttributeNamePrefix(this.invalidAttributeNamePrefix);
|
||||
|
||||
// set cleaner transformation if specified in "transform" attribute
|
||||
// format of attribute is expected to be <transkey1>[=<transvalue1>]|<transkey2>[=<transvalue2>...
|
||||
// (separator is pipe character)
|
||||
if ( !Utils.isEmptyString(transform) ) {
|
||||
String[] transItems = Utils.tokenize(transform, "|");
|
||||
Map transInfos = new TreeMap();
|
||||
for (String item : transItems) {
|
||||
int index = item.indexOf('=');
|
||||
String key = index <= 0 ? item : item.substring(0, index);
|
||||
String value = index <= 0 ? null : item.substring(index + 1);
|
||||
transInfos.put(key, value);
|
||||
}
|
||||
|
||||
cleaner.initCleanerTransformations(transInfos);
|
||||
}
|
||||
|
||||
try {
|
||||
TagNode node;
|
||||
try {
|
||||
if ( src != null && (src.startsWith("http://") || src.startsWith("https://")) ) {
|
||||
node = cleaner.clean(new URL(src), incharset);
|
||||
} else if (src != null) {
|
||||
node = cleaner.clean(new File(src), incharset);
|
||||
} else {
|
||||
node = cleaner.clean(text);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new BuildException(e);
|
||||
}
|
||||
|
||||
// if user specifies XPath expresssion to choose node for serialization, then
|
||||
// try to evaluate XPath and look for first TagNode instance in the resulting array
|
||||
if ( nodebyxpath != null ) {
|
||||
final Object[] xpathResult = node.evaluateXPath(nodebyxpath);
|
||||
for (Object element : xpathResult) {
|
||||
if ( element instanceof TagNode ) {
|
||||
node = (TagNode) element;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
OutputStream out;
|
||||
|
||||
String antPropertyName = "";
|
||||
|
||||
if ( dest == null || "".equals(dest.trim()) ) {
|
||||
out = System.out;
|
||||
} else if ( dest.startsWith("property:") ) {
|
||||
out = new ByteArrayOutputStream();
|
||||
antPropertyName = dest.substring(dest.indexOf(':') + 1);
|
||||
getProject().log("Setting property " + antPropertyName);
|
||||
} else {
|
||||
out = new FileOutputStream(dest);
|
||||
}
|
||||
|
||||
if ( "compact".equals(outputtype) ) {
|
||||
new CompactXmlSerializer(props).writeToStream(node, out, outcharset);
|
||||
} else if ( "browser-compact".equals(outputtype) ) {
|
||||
new BrowserCompactXmlSerializer(props).writeToStream(node, out, outcharset);
|
||||
} else if ( "pretty".equals(outputtype) ) {
|
||||
new PrettyXmlSerializer(props).writeToStream(node, out, outcharset);
|
||||
} else {
|
||||
new SimpleXmlSerializer(props).writeToStream(node, out, outcharset);
|
||||
}
|
||||
|
||||
if ( antPropertyName != null && antPropertyName.length() > 0 ) {
|
||||
getProject().setNewProperty(antPropertyName, out.toString());
|
||||
}
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new BuildException(e);
|
||||
} catch (XPatherException e) {
|
||||
throw new BuildException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isAllowInvalidAttributeNames() {
|
||||
return allowInvalidAttributeNames;
|
||||
}
|
||||
|
||||
public void setAllowInvalidAttributeNames(boolean allowInvalidAttributeNames) {
|
||||
this.allowInvalidAttributeNames = allowInvalidAttributeNames;
|
||||
}
|
||||
|
||||
public String getInvalidAttributeNamePrefix() {
|
||||
return invalidAttributeNamePrefix;
|
||||
}
|
||||
|
||||
public void setInvalidAttributeNamePrefix(String invalidAttributeNamePrefix) {
|
||||
this.invalidAttributeNamePrefix = invalidAttributeNamePrefix;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Marker interface denoting nodes of the document tree
|
||||
*/
|
||||
public interface HtmlNode extends BaseToken {
|
||||
|
||||
public List<? extends BaseToken> getSiblings();
|
||||
|
||||
public TagNode getParent();
|
||||
|
||||
public void setParent(TagNode parent);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,141 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>Abstract HTML serializer - contains common logic for descendants.</p>
|
||||
*/
|
||||
public abstract class HtmlSerializer extends Serializer {
|
||||
|
||||
protected HtmlSerializer(CleanerProperties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
|
||||
protected boolean isMinimizedTagSyntax(TagNode tagNode) {
|
||||
final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
|
||||
return tagInfo != null && !tagNode.hasChildren() && tagInfo.isEmptyTag();
|
||||
}
|
||||
|
||||
protected boolean dontEscape(TagNode tagNode) {
|
||||
return isScriptOrStyle(tagNode);
|
||||
}
|
||||
|
||||
protected String escapeText(String content) {
|
||||
return Utils.escapeHtml(content, props);
|
||||
}
|
||||
|
||||
protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
|
||||
String tagName = tagNode.getName();
|
||||
|
||||
if (Utils.isEmptyString(tagName)) {
|
||||
return;
|
||||
}
|
||||
|
||||
boolean nsAware = props.isNamespacesAware();
|
||||
|
||||
if (!nsAware && Utils.getXmlNSPrefix(tagName) != null ) {
|
||||
tagName = Utils.getXmlName(tagName);
|
||||
}
|
||||
|
||||
writer.write("<" + tagName);
|
||||
for (Map.Entry<String, String> entry: tagNode.getAttributes().entrySet()) {
|
||||
String attName = entry.getKey();
|
||||
String attValue = entry.getValue();
|
||||
attValue = Utils.deserializeEntities(attValue, props.isRecognizeUnicodeChars());
|
||||
|
||||
//
|
||||
// Note that because we implemented the WHATWG attribute identifier rules
|
||||
// during the tokenize stage, we'll never have invalid attribute names at
|
||||
// this point.
|
||||
//
|
||||
if (attName != null){
|
||||
|
||||
if (!nsAware && Utils.getXmlNSPrefix(attName) != null ) {
|
||||
attName = Utils.getXmlName(attName);
|
||||
}
|
||||
if (!(nsAware && attName.equalsIgnoreCase("xmlns")))
|
||||
writer.write(" " + attName + "=\"" + escapeText(attValue) + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
if (nsAware) {
|
||||
Map<String, String> nsDeclarations = tagNode.getNamespaceDeclarations();
|
||||
if (nsDeclarations != null) {
|
||||
for (Map.Entry<String, String> entry: nsDeclarations.entrySet()) {
|
||||
String prefix = entry.getKey();
|
||||
String att = "xmlns";
|
||||
if (prefix.length() > 0) {
|
||||
att += ":" + prefix;
|
||||
}
|
||||
writer.write(" " + att + "=\"" + escapeText(entry.getValue()) + "\"");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( isMinimizedTagSyntax(tagNode) ) {
|
||||
writer.write(" />");
|
||||
if (newLine) {
|
||||
writer.write("\n");
|
||||
}
|
||||
} else {
|
||||
writer.write(">");
|
||||
}
|
||||
}
|
||||
|
||||
protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
|
||||
String tagName = tagNode.getName();
|
||||
|
||||
if (Utils.isEmptyString(tagName)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (Utils.getXmlNSPrefix(tagName) != null && !props.isNamespacesAware()) {
|
||||
tagName = Utils.getXmlName(tagName);
|
||||
}
|
||||
|
||||
writer.write( "</" + tagName + ">" );
|
||||
if (newLine) {
|
||||
writer.write("\n");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,52 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Provides set of TagInfo instances. The instance of this interface is used as a
|
||||
* collection of tag definitions used in cleanup process. Implementing this interface
|
||||
* desired behaviour of cleaner can be achived.<br/>
|
||||
* In most cases implementation will be or contain a kind of Map.
|
||||
* </p>
|
||||
*/
|
||||
public interface ITagInfoProvider {
|
||||
|
||||
public TagInfo getTagInfo(String tagName);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,254 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.jdom2.CDATA;
|
||||
import org.jdom2.Comment;
|
||||
import org.jdom2.DefaultJDOMFactory;
|
||||
import org.jdom2.Document;
|
||||
import org.jdom2.Element;
|
||||
import org.jdom2.Namespace;
|
||||
import org.jdom2.Text;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* JDom serializer - creates xml JDom instance out of the TagNode.
|
||||
* </p>
|
||||
*/
|
||||
public class JDomSerializer {
|
||||
|
||||
private static final String CSS_COMMENT_START = "/*";
|
||||
|
||||
private static final String CSS_COMMENT_END = "*/";
|
||||
|
||||
private static final String NEW_LINE = "\n";
|
||||
|
||||
private DefaultJDOMFactory factory;
|
||||
|
||||
protected CleanerProperties props;
|
||||
protected boolean escapeXml = true;
|
||||
|
||||
public JDomSerializer(CleanerProperties props, boolean escapeXml) {
|
||||
this.props = props;
|
||||
this.escapeXml = escapeXml;
|
||||
}
|
||||
|
||||
public JDomSerializer(CleanerProperties props) {
|
||||
this(props, true);
|
||||
}
|
||||
|
||||
public Document createJDom(TagNode rootNode) {
|
||||
this.factory = new DefaultJDOMFactory();
|
||||
|
||||
//
|
||||
// If there is no actual root node then return nothing
|
||||
//
|
||||
if (rootNode.getName() == null) return null;
|
||||
|
||||
Element rootElement = createElement(rootNode);
|
||||
Document document = this.factory.document(rootElement);
|
||||
|
||||
setAttributes(rootNode, rootElement);
|
||||
|
||||
createSubnodes(rootElement, rootNode.getAllChildren());
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
private Element createElement(TagNode node) {
|
||||
String name = node.getName();
|
||||
//
|
||||
// XML element names are more strict in their definition
|
||||
// than HTML tag identifiers.
|
||||
// See https://www.w3.org/TR/xml/#NT-Name
|
||||
// vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
//
|
||||
name = Utils.sanitizeXmlIdentifier(name);
|
||||
|
||||
boolean nsAware = props.isNamespacesAware();
|
||||
String prefix = Utils.getXmlNSPrefix(name);
|
||||
Map<String, String> nsDeclarations = node.getNamespaceDeclarations();
|
||||
String nsURI = null;
|
||||
if (prefix != null) {
|
||||
name = Utils.getXmlName(name);
|
||||
if (nsAware) {
|
||||
if (nsDeclarations != null) {
|
||||
nsURI = nsDeclarations.get(prefix);
|
||||
}
|
||||
if (nsURI == null) {
|
||||
nsURI = node.getNamespaceURIOnPath(prefix);
|
||||
}
|
||||
if (nsURI == null) {
|
||||
nsURI = prefix;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (nsAware) {
|
||||
if (nsDeclarations != null) {
|
||||
nsURI = nsDeclarations.get("");
|
||||
}
|
||||
if (nsURI == null) {
|
||||
nsURI = node.getNamespaceURIOnPath(prefix);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Element element;
|
||||
if (nsAware && nsURI != null) {
|
||||
Namespace ns = prefix == null ? Namespace.getNamespace(nsURI) : Namespace.getNamespace(prefix, nsURI);
|
||||
element = factory.element(name, ns);
|
||||
} else {
|
||||
element = factory.element(name);
|
||||
}
|
||||
|
||||
if (nsAware) {
|
||||
defineNamespaceDeclarations(node, element);
|
||||
}
|
||||
return element;
|
||||
}
|
||||
|
||||
private void defineNamespaceDeclarations(TagNode node, Element element) {
|
||||
Map<String, String> nsDeclarations = node.getNamespaceDeclarations();
|
||||
if (nsDeclarations != null) {
|
||||
for (Map.Entry<String, String> nsEntry : nsDeclarations.entrySet()) {
|
||||
String nsPrefix = nsEntry.getKey();
|
||||
String nsURI = nsEntry.getValue();
|
||||
Namespace ns = nsPrefix == null || "".equals(nsPrefix) ? Namespace.getNamespace(nsURI) : Namespace
|
||||
.getNamespace(nsPrefix, nsURI);
|
||||
element.addNamespaceDeclaration(ns);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void setAttributes(TagNode node, Element element) {
|
||||
for (Map.Entry<String, String> entry : node.getAttributes().entrySet()) {
|
||||
String attrName = entry.getKey();
|
||||
String attrValue = entry.getValue();
|
||||
if (escapeXml) {
|
||||
attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
|
||||
attrValue = Utils.escapeXml(attrValue, props, true);
|
||||
}
|
||||
|
||||
//
|
||||
// Fix any invalid attribute names
|
||||
//
|
||||
if (!props.isAllowInvalidAttributeNames()){
|
||||
attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix(),"");
|
||||
}
|
||||
|
||||
//
|
||||
// Note that even if we did want to allow invalid attribute names, JDom won't allow it
|
||||
//
|
||||
if (attrName != null && Utils.isValidXmlIdentifier(attrName)){
|
||||
String attPrefix = Utils.getXmlNSPrefix(attrName);
|
||||
Namespace ns = null;
|
||||
if (attPrefix != null) {
|
||||
attrName = Utils.getXmlName(attrName);
|
||||
if (props.isNamespacesAware()) {
|
||||
String nsURI = node.getNamespaceURIOnPath(attPrefix);
|
||||
if (nsURI == null) {
|
||||
nsURI = attPrefix;
|
||||
}
|
||||
if (!attPrefix.startsWith("xml")) {
|
||||
ns = Namespace.getNamespace(attPrefix, nsURI);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Don't manually add xmlns attributes as these should be
|
||||
// handled automatically by JDOM through the namespace
|
||||
// mechanism
|
||||
//
|
||||
if (!attrName.equals("xmlns")){
|
||||
if (ns == null) {
|
||||
element.setAttribute(attrName, attrValue);
|
||||
} else {
|
||||
element.setAttribute(attrName, attrValue, ns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void createSubnodes(Element element, List<? extends BaseToken> tagChildren) {
|
||||
if (tagChildren != null) {
|
||||
|
||||
CDATA cdata = null;
|
||||
//
|
||||
// For script and style nodes, check if we're set to use CDATA
|
||||
//
|
||||
if (props.isUseCdataFor(element.getName())){
|
||||
cdata = factory.cdata("");
|
||||
element.addContent(factory.text(CSS_COMMENT_START));
|
||||
element.addContent(cdata);
|
||||
}
|
||||
|
||||
|
||||
Iterator<? extends BaseToken> it = tagChildren.iterator();
|
||||
while (it.hasNext()) {
|
||||
|
||||
Object item = it.next();
|
||||
|
||||
if (item instanceof CommentNode) {
|
||||
CommentNode commentNode = (CommentNode) item;
|
||||
Comment comment = factory.comment(commentNode.getContent().toString());
|
||||
element.addContent(comment);
|
||||
|
||||
} else if (item instanceof ContentNode) {
|
||||
String nodeName = element.getName();
|
||||
String content = item.toString();
|
||||
boolean specialCase = props.isUseCdataFor(nodeName);
|
||||
|
||||
if (escapeXml && !specialCase) {
|
||||
content = Utils.escapeXml(content, props, true);
|
||||
}
|
||||
if (specialCase && item instanceof CData){
|
||||
//
|
||||
// For CDATA sections we don't want to return the start and
|
||||
// end tokens. See issue #106.
|
||||
//
|
||||
content = ((CData)item).getContentWithoutStartAndEndTokens();
|
||||
}
|
||||
if (cdata != null){
|
||||
cdata.append(content);
|
||||
} else {
|
||||
Text text = factory.text(content);
|
||||
element.addContent(text);
|
||||
}
|
||||
|
||||
} else if (item instanceof TagNode) {
|
||||
TagNode subTagNode = (TagNode) item;
|
||||
Element subelement = createElement(subTagNode);
|
||||
|
||||
setAttributes(subTagNode, subelement);
|
||||
|
||||
// recursively create subnodes
|
||||
createSubnodes(subelement, subTagNode.getAllChildren());
|
||||
|
||||
element.addContent(subelement);
|
||||
} else if (item instanceof List) {
|
||||
List sublist = (List) item;
|
||||
createSubnodes(element, sublist);
|
||||
}
|
||||
|
||||
}
|
||||
if (cdata != null){
|
||||
if (!cdata.getText().startsWith(NEW_LINE)){
|
||||
cdata.setText(CSS_COMMENT_END + NEW_LINE + cdata.getText());
|
||||
} else {
|
||||
cdata.setText(CSS_COMMENT_END + cdata.getText());
|
||||
}
|
||||
if (!cdata.getText().endsWith(NEW_LINE)){
|
||||
|
||||
cdata.append(NEW_LINE);
|
||||
}
|
||||
cdata.append(CSS_COMMENT_START);
|
||||
element.addContent(factory.text(CSS_COMMENT_END));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,185 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
/**
 * Contains the MathML tags to use with HTML5 tags.
 *
 * @author User
 */
|
||||
public class MathMLTagProvider {
|
||||
|
||||
private static final String CLOSE_BEFORE_TAGS = "menclose,mpadded,mphantom,mfenced,mstyle,merror,msqrt,mroot,maligngroup,malignmark,mlabeledtr,ms,mi,mo,mn,mfrac,mtext,mspace,mglyph,p,details,summary,menuitem,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
|
||||
|
||||
public MathMLTagProvider(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap) {
|
||||
presentationMarkup(tagInfo,tagInfoMap);
|
||||
}
|
||||
|
||||
public void presentationMarkup(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
|
||||
tokenElements(tagInfo,tagInfoMap);
|
||||
layoutElements(tagInfo,tagInfoMap);
|
||||
scriptElements(tagInfo,tagInfoMap);
|
||||
tableElements(tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("maction", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("maction", tagInfo,tagInfoMap);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void tokenElements(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
|
||||
tagInfo = new TagInfo("mi", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mi", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mn", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mn", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mo", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mo", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mtext", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mtext", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mspace", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mspace", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("ms", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("ms", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mglyph", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mglyph", tagInfo,tagInfoMap);
|
||||
}
|
||||
|
||||
|
||||
public void layoutElements(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
|
||||
|
||||
tagInfo = new TagInfo("mrow", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mrow", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mfrac", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mfrac", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("msqrt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("msqrt", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mroot", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mroot", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mstyle", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mstyle", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("merror", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("merror", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mpadded", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mpadded", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mphantom", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mphantom", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mfenced", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mfenced", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("menclose", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("menclose", tagInfo,tagInfoMap);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void scriptElements(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
|
||||
tagInfo = new TagInfo("msub", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("msub", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("msup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("msup", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("msubsup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("msubsup", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("munder", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("munder", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mover", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mover", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("munderover", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("munderover", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mmultiscripts", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("mmultiscripts", tagInfo,tagInfoMap);
|
||||
|
||||
}
|
||||
|
||||
public void tableElements(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
|
||||
tagInfo = new TagInfo("mtable", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineAllowedChildrenTags("mtr,mtd,mo,mn,mlabeledtr");
|
||||
this.put("mtable", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mlabeledtr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineRequiredEnclosingTags("mtable");
|
||||
tagInfo.defineFatalTags("mtable");
|
||||
this.put("mlabeledtr", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mtr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
tagInfo.defineAllowedChildrenTags("mtd,mlabeledtr");
|
||||
//tagInfo.defineRequiredEnclosingTags("mtable");
|
||||
this.put("mtr", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("mtd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
//tagInfo.defineRequiredEnclosingTags("mtr");
|
||||
//tagInfo.defineFatalTags("mtable");
|
||||
this.put("mtd", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("maligngroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("maligngroup", tagInfo,tagInfoMap);
|
||||
|
||||
tagInfo = new TagInfo("malignmark", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
|
||||
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
|
||||
this.put("malignmark", tagInfo,tagInfoMap);
|
||||
|
||||
}
|
||||
|
||||
|
||||
protected void put(String tagName, TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap) {
|
||||
tagInfoMap.put(tagName, tagInfo);
|
||||
}
|
||||
|
||||
public TagInfo getTagInfo(String tagName,ConcurrentMap<String, TagInfo> tagInfoMap) {
|
||||
if ( tagName == null) {
|
||||
return null;
|
||||
} else {
|
||||
return tagInfoMap.get(tagName);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
|
||||
/**
|
||||
* Nesting State
|
||||
* Wrapper for a current HtmlCleaner cleaning state, keeping together
|
||||
* the set of open tags and breaks in the current state.
|
||||
* @author scottw
|
||||
*/
|
||||
class NestingState {
|
||||
|
||||
private OpenTags openTags;
|
||||
private ChildBreaks childBreaks;
|
||||
|
||||
public NestingState(OpenTags openTags, ChildBreaks childBreaks) {
|
||||
this.openTags = openTags;
|
||||
this.childBreaks = childBreaks;
|
||||
}
|
||||
|
||||
public OpenTags getOpenTags() {
|
||||
return this.openTags;
|
||||
}
|
||||
public ChildBreaks getChildBreaks() {
|
||||
return this.childBreaks;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,133 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Class that contains information and methods for managing list of open,
|
||||
* but unhandled tags.
|
||||
*/
|
||||
class OpenTags {
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private final HtmlCleaner htmlCleaner;
|
||||
|
||||
/**
|
||||
* @param htmlCleaner
|
||||
*/
|
||||
OpenTags(HtmlCleaner htmlCleaner) {
|
||||
this.htmlCleaner = htmlCleaner;
|
||||
}
|
||||
|
||||
List<TagPos> list = new ArrayList<TagPos>();
|
||||
private TagPos last;
|
||||
private Set<String> set = new HashSet<String>();
|
||||
|
||||
boolean isEmpty() {
|
||||
return list.isEmpty();
|
||||
}
|
||||
|
||||
void addTag(String tagName, TagInfo tagInfo, int position, CleanTimeValues cleanTimeValues) {
|
||||
last = new TagPos(position, tagName, tagInfo, cleanTimeValues);
|
||||
list.add(last);
|
||||
set.add(tagName);
|
||||
}
|
||||
|
||||
void removeTag(String tagName) {
|
||||
ListIterator<TagPos> it = list.listIterator( list.size() );
|
||||
while ( it.hasPrevious() ) {
|
||||
if (Thread.currentThread().isInterrupted()) {
|
||||
this.htmlCleaner.handleInterruption();
|
||||
break;
|
||||
}
|
||||
TagPos currTagPos = it.previous();
|
||||
if (tagName.equals(currTagPos.name)) {
|
||||
it.remove();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
last = list.isEmpty() ? null : (TagPos) list.get( list.size() - 1 );
|
||||
}
|
||||
|
||||
TagPos findFirstTagPos() {
|
||||
return list.isEmpty() ? null : (TagPos) list.get(0);
|
||||
}
|
||||
|
||||
TagPos getLastTagPos() {
|
||||
return last;
|
||||
}
|
||||
|
||||
TagPos findTag(String tagName, CleanTimeValues cleanTimeValues) {
|
||||
if (tagName != null) {
|
||||
ListIterator<TagPos> it = list.listIterator(list.size());
|
||||
String fatalTag = null;
|
||||
TagInfo fatalInfo = this.htmlCleaner.getTagInfo(tagName, cleanTimeValues);
|
||||
|
||||
while (it.hasPrevious()) {
|
||||
if (Thread.currentThread().isInterrupted()) {
|
||||
this.htmlCleaner.handleInterruption();
|
||||
return null;
|
||||
}
|
||||
TagPos currTagPos = it.previous();
|
||||
if (tagName.equals(currTagPos.name)) {
|
||||
return currTagPos;
|
||||
} else if (fatalInfo != null && fatalInfo.isFatalTag(currTagPos.name)) {
|
||||
// do not search past a fatal tag for this tag
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
boolean tagExists(String tagName, CleanTimeValues cleanTimeValues) {
|
||||
TagPos tagPos = findTag(tagName, cleanTimeValues);
|
||||
return tagPos != null;
|
||||
}
|
||||
|
||||
TagPos findTagToPlaceRubbish() {
|
||||
TagPos result = null, prev = null;
|
||||
|
||||
if ( !isEmpty() ) {
|
||||
ListIterator<TagPos> it = list.listIterator( list.size() );
|
||||
while ( it.hasPrevious() ) {
|
||||
if (Thread.currentThread().isInterrupted()) {
|
||||
this.htmlCleaner.handleInterruption();
|
||||
return null;
|
||||
}
|
||||
result = it.previous();
|
||||
if ( result.info == null || result.info.allowsAnything() ) {
|
||||
if (prev != null) {
|
||||
return prev;
|
||||
}
|
||||
}
|
||||
prev = result;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
boolean tagEncountered(String tagName) {
|
||||
return set.contains(tagName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if any of tags specified in the set are already open.
|
||||
* @param tags
|
||||
*/
|
||||
boolean someAlreadyOpen(Set<String> tags) {
|
||||
for (TagPos curr : list) {
|
||||
if ( tags.contains(curr.name) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
 * Policy for emitting an optional item: never, only when the source supplied it, or always.
 */
public enum OptionalOutput {

    /** Never output, even if supplied in the source. */
    omit,

    /** Output ONLY if supplied in the source. */
    preserve,

    /** Always output; if the information is not supplied in the source a default is created. */
    alwaysOutput
}
|
||||
@@ -0,0 +1,221 @@
|
||||
/* Copyright (c) 2006-2013, HtmlCleaner project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>Pretty HTML serializer - creates resulting HTML with indenting lines.</p>
|
||||
*/
|
||||
public class PrettyHtmlSerializer extends HtmlSerializer {
|
||||
|
||||
private static final String DEFAULT_INDENTATION_STRING = "\t";
|
||||
|
||||
private String indentString = DEFAULT_INDENTATION_STRING;
|
||||
private List<String> indents = new ArrayList<String>();
|
||||
|
||||
public PrettyHtmlSerializer(CleanerProperties props) {
|
||||
this(props, DEFAULT_INDENTATION_STRING);
|
||||
}
|
||||
|
||||
public PrettyHtmlSerializer(CleanerProperties props, String indentString) {
|
||||
super(props);
|
||||
this.indentString = indentString;
|
||||
}
|
||||
|
||||
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
|
||||
serializePrettyHtml(tagNode, writer, 0, false, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param level
|
||||
* @return Appropriate indentation for the specified depth.
|
||||
*/
|
||||
private synchronized String getIndent(int level) {
|
||||
int size = indents.size();
|
||||
if (size <= level) {
|
||||
String prevIndent = size == 0 ? null : indents.get(size - 1);
|
||||
for (int i = size; i <= level; i++) {
|
||||
String currIndent = prevIndent == null ? "" : prevIndent + indentString;
|
||||
indents.add(currIndent);
|
||||
prevIndent = currIndent;
|
||||
}
|
||||
}
|
||||
|
||||
return indents.get(level);
|
||||
}
|
||||
|
||||
private String getIndentedText(String content, int level) {
|
||||
String indent = getIndent(level);
|
||||
StringBuilder result = new StringBuilder( content.length() );
|
||||
StringTokenizer tokenizer = new StringTokenizer(content, "\n\r");
|
||||
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String line = tokenizer.nextToken().trim();
|
||||
if (!"".equals(line)) {
|
||||
result.append(indent).append(line).append("\n");
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
private String getSingleLineOfChildren(List<? extends BaseToken> children) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
Iterator<? extends BaseToken> childrenIt = children.iterator();
|
||||
boolean isFirst = true;
|
||||
|
||||
while (childrenIt.hasNext()) {
|
||||
Object child = childrenIt.next();
|
||||
|
||||
if ( !(child instanceof ContentNode) ) {
|
||||
return null;
|
||||
} else {
|
||||
String content = child.toString();
|
||||
|
||||
//
|
||||
// Removed the trim function as this has the potential
|
||||
// to cause issues with actual content without adding
|
||||
// any value
|
||||
//
|
||||
|
||||
/*
|
||||
// if first item trims it from left
|
||||
if (isFirst) {
|
||||
content = Utils.ltrim(content);
|
||||
}
|
||||
|
||||
// if last item trims it from right
|
||||
if (!childrenIt.hasNext()) {
|
||||
content = Utils.rtrim(content);
|
||||
}
|
||||
*/
|
||||
|
||||
if ( content.indexOf("\n") >= 0 || content.indexOf("\r") >= 0 ) {
|
||||
return null;
|
||||
}
|
||||
result.append(content);
|
||||
}
|
||||
|
||||
isFirst = false;
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
protected void serializePrettyHtml(TagNode tagNode, Writer writer, int level, boolean isPreserveWhitespaces, boolean isLastNewLine) throws IOException {
|
||||
List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
|
||||
String tagName = tagNode.getName();
|
||||
boolean isHeadlessNode = Utils.isEmptyString(tagName);
|
||||
String indent = isHeadlessNode ? "" : getIndent(level);
|
||||
|
||||
if (!isPreserveWhitespaces) {
|
||||
if (!isLastNewLine) {
|
||||
writer.write("\n");
|
||||
}
|
||||
writer.write(indent);
|
||||
}
|
||||
serializeOpenTag(tagNode, writer, true);
|
||||
|
||||
boolean preserveWhitespaces = isPreserveWhitespaces || "pre".equalsIgnoreCase(tagName);
|
||||
|
||||
boolean lastWasNewLine = false;
|
||||
|
||||
if ( !isMinimizedTagSyntax(tagNode) ) {
|
||||
String singleLine = getSingleLineOfChildren(tagChildren);
|
||||
boolean dontEscape = dontEscape(tagNode);
|
||||
if (!preserveWhitespaces && singleLine != null) {
|
||||
writer.write( !dontEscape(tagNode) ? escapeText(singleLine) : singleLine );
|
||||
} else {
|
||||
Iterator<? extends BaseToken> childIterator = tagChildren.iterator();
|
||||
while (childIterator.hasNext()) {
|
||||
Object child = childIterator.next();
|
||||
if (child instanceof TagNode) {
|
||||
serializePrettyHtml((TagNode)child, writer, isHeadlessNode ? level : level + 1, preserveWhitespaces, lastWasNewLine);
|
||||
lastWasNewLine = false;
|
||||
} else if (child instanceof ContentNode) {
|
||||
String content = dontEscape ? child.toString() : escapeText(child.toString());
|
||||
if (content.length() > 0) {
|
||||
if (dontEscape || preserveWhitespaces) {
|
||||
writer.write(content);
|
||||
} else if (Character.isWhitespace(content.charAt(0))) {
|
||||
if (!lastWasNewLine) {
|
||||
writer.write("\n");
|
||||
lastWasNewLine = false;
|
||||
}
|
||||
if (content.trim().length() > 0) {
|
||||
writer.write( getIndentedText(Utils.rtrim(content), isHeadlessNode ? level : level + 1) );
|
||||
} else {
|
||||
lastWasNewLine = true;
|
||||
}
|
||||
} else {
|
||||
if (content.trim().length() > 0) {
|
||||
writer.write(Utils.rtrim(content));
|
||||
}
|
||||
if (!childIterator.hasNext()) {
|
||||
writer.write("\n");
|
||||
lastWasNewLine = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (child instanceof CommentNode) {
|
||||
|
||||
if (!lastWasNewLine && !preserveWhitespaces) {
|
||||
writer.write("\n");
|
||||
lastWasNewLine = false;
|
||||
}
|
||||
CommentNode commentNode = (CommentNode) child;
|
||||
String content = commentNode.getCommentedContent();
|
||||
writer.write( dontEscape ? content : getIndentedText(content, isHeadlessNode ? level : level + 1) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (singleLine == null && !preserveWhitespaces) {
|
||||
if (!lastWasNewLine) {
|
||||
writer.write("\n");
|
||||
}
|
||||
writer.write(indent);
|
||||
}
|
||||
|
||||
serializeEndTag(tagNode, writer, false);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,217 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>Pretty XML serializer - creates resulting XML with indenting lines.</p>
|
||||
*/
|
||||
public class PrettyXmlSerializer extends XmlSerializer {
|
||||
|
||||
private static final String DEFAULT_INDENTATION_STRING = "\t";
|
||||
|
||||
private String indentString = DEFAULT_INDENTATION_STRING;
|
||||
private List<String> indents = new ArrayList<String>();
|
||||
|
||||
public PrettyXmlSerializer(CleanerProperties props) {
|
||||
this(props, DEFAULT_INDENTATION_STRING);
|
||||
}
|
||||
|
||||
public PrettyXmlSerializer(CleanerProperties props, String indentString) {
|
||||
super(props);
|
||||
this.indentString = indentString;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
|
||||
serializePrettyXml(tagNode, writer, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param level
|
||||
* @return Appropriate indentation for the specified depth.
|
||||
*/
|
||||
private synchronized String getIndent(int level) {
|
||||
int size = indents.size();
|
||||
if (size <= level) {
|
||||
String prevIndent = size == 0 ? null : indents.get(size - 1);
|
||||
for (int i = size; i <= level; i++) {
|
||||
String currIndent = prevIndent == null ? "" : prevIndent + indentString;
|
||||
indents.add(currIndent);
|
||||
prevIndent = currIndent;
|
||||
}
|
||||
}
|
||||
|
||||
return indents.get(level);
|
||||
}
|
||||
|
||||
private String getIndentedText(String content, int level) {
|
||||
String indent = getIndent(level);
|
||||
StringBuilder result = new StringBuilder( content.length() );
|
||||
StringTokenizer tokenizer = new StringTokenizer(content, "\n\r");
|
||||
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String line = tokenizer.nextToken().trim();
|
||||
if (!"".equals(line)) {
|
||||
result.append(indent).append(line).append("\n");
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
private String getSingleLineOfChildren(List<? extends BaseToken> children) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
Iterator<? extends BaseToken> childrenIt = children.iterator();
|
||||
boolean isFirst = true;
|
||||
|
||||
while (childrenIt.hasNext()) {
|
||||
Object child = childrenIt.next();
|
||||
|
||||
if ( !(child instanceof ContentNode) ) {
|
||||
return null;
|
||||
} else {
|
||||
String content = child.toString();
|
||||
|
||||
// if first item trims it from left
|
||||
if (isFirst) {
|
||||
content = ltrim(content);
|
||||
}
|
||||
|
||||
// if last item trims it from right
|
||||
if (!childrenIt.hasNext()) {
|
||||
content = rtrim(content);
|
||||
}
|
||||
|
||||
if ( content.indexOf("\n") >= 0 || content.indexOf("\r") >= 0 ) {
|
||||
return null;
|
||||
}
|
||||
result.append(content);
|
||||
}
|
||||
|
||||
isFirst = false;
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
protected void serializePrettyXml(TagNode tagNode, Writer writer, int level) throws IOException {
|
||||
List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
|
||||
boolean isHeadlessNode = Utils.isEmptyString(tagNode.getName());
|
||||
String indent = isHeadlessNode ? "" : getIndent(level);
|
||||
|
||||
writer.write(indent);
|
||||
serializeOpenTag(tagNode, writer, true);
|
||||
|
||||
if ( !isMinimizedTagSyntax(tagNode) ) {
|
||||
String singleLine = getSingleLineOfChildren(tagChildren);
|
||||
boolean dontEscape = dontEscape(tagNode);
|
||||
if (singleLine != null) {
|
||||
if ( !dontEscape(tagNode) ) {
|
||||
writer.write( escapeXml(singleLine) );
|
||||
} else {
|
||||
writer.write( singleLine.replaceAll("]]>", "]]>") );
|
||||
}
|
||||
} else {
|
||||
if (!isHeadlessNode) {
|
||||
writer.write("\n");
|
||||
}
|
||||
for (Object child: tagChildren) {
|
||||
if (child instanceof TagNode) {
|
||||
serializePrettyXml( (TagNode)child, writer, isHeadlessNode ? level : level + 1 );
|
||||
} else if (child instanceof CData){
|
||||
serializeCData((CData)child, tagNode, writer);
|
||||
} else if (child instanceof ContentNode) {
|
||||
String content = dontEscape ? child.toString().replaceAll("]]>", "]]>") : escapeXml(child.toString());
|
||||
writer.write( getIndentedText(content, isHeadlessNode ? level : level + 1) );
|
||||
} else if (child instanceof CommentNode) {
|
||||
CommentNode commentNode = (CommentNode) child;
|
||||
String content = commentNode.getCommentedContent();
|
||||
writer.write( getIndentedText(content, isHeadlessNode ? level : level + 1) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (singleLine == null) {
|
||||
writer.write(indent);
|
||||
}
|
||||
|
||||
serializeEndTag(tagNode, writer, true);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Trims specified string from left.
|
||||
* @param s
|
||||
*/
|
||||
private String ltrim(String s) {
|
||||
if (s == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
int index = 0;
|
||||
int len = s.length();
|
||||
|
||||
while ( index < len && Character.isWhitespace(s.charAt(index)) ) {
|
||||
index++;
|
||||
}
|
||||
|
||||
return (index >= len) ? "" : s.substring(index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trims specified string from right.
|
||||
* @param s
|
||||
*/
|
||||
private String rtrim(String s) {
|
||||
if (s == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
int len = s.length();
|
||||
int index = len;
|
||||
|
||||
while ( index > 0 && Character.isWhitespace(s.charAt(index-1)) ) {
|
||||
index--;
|
||||
}
|
||||
|
||||
return (index <= 0) ? "" : s.substring(0, index);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
|
||||
* A {@link TagNode} that only really holds whitespace or comments - allows
|
||||
* using {@link ContentNode} in places where a {@link TagNode} is expected.
|
||||
* <p/>
|
||||
* This class is currently just a short-lived intermediate artifact generated
|
||||
* from {@link HtmlCleaner} while cleaning an html file and descarded
|
||||
* before the results are returned.
|
||||
*
|
||||
* @author andyhot
|
||||
*/
|
||||
class ProxyTagNode extends TagNode {
|
||||
private ContentNode token;
|
||||
private CommentNode comment;
|
||||
private TagNode bodyNode;
|
||||
|
||||
public ProxyTagNode(ContentNode token, TagNode bodyNode) {
|
||||
super("");
|
||||
this.token = token;
|
||||
this.bodyNode = bodyNode;
|
||||
}
|
||||
|
||||
public ProxyTagNode(CommentNode comment, TagNode bodyNode) {
|
||||
super("");
|
||||
this.comment = comment;
|
||||
this.bodyNode = bodyNode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TagNode getParent() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean removeFromTree() {
|
||||
bodyNode.removeChild(getToken());
|
||||
return true;
|
||||
}
|
||||
|
||||
public BaseToken getToken() {
|
||||
return token!=null ? token : comment;
|
||||
}
|
||||
|
||||
public String getContent() {
|
||||
return token!=null ? token.getContent() : comment.getContent();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,273 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>Basic abstract serializer - contains common logic for descendants (methods <code>writeXXX()</code>).</p>
|
||||
*/
|
||||
public abstract class Serializer {
|
||||
|
||||
/**
|
||||
* Used to implement serialization with missing envelope - omiting open and close tags, just
|
||||
* serialize children.
|
||||
*/
|
||||
private class HeadlessTagNode extends TagNode {
|
||||
private HeadlessTagNode(TagNode wrappedNode) {
|
||||
super("");
|
||||
getAttributes().putAll(wrappedNode.getAttributes());
|
||||
addChildren(wrappedNode.getAllChildren());
|
||||
setDocType(wrappedNode.getDocType());
|
||||
Map<String, String> nsDecls = getNamespaceDeclarations();
|
||||
if (nsDecls != null) {
|
||||
Map<String, String> wrappedNSDecls = wrappedNode.getNamespaceDeclarations();
|
||||
if (wrappedNSDecls != null) {
|
||||
nsDecls.putAll(wrappedNSDecls);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
protected CleanerProperties props;
|
||||
|
||||
protected Serializer(CleanerProperties props) {
|
||||
this.props = props;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified TagNode to the output stream, using specified charset and optionally omits node envelope
|
||||
* (skips open and close tags of the node).
|
||||
* @param tagNode Node to be written
|
||||
* @param out Output stream
|
||||
* @param charset Charset of the output
|
||||
* @param omitEnvelope Tells whether to skip open and close tag of the node.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void writeToStream(TagNode tagNode, OutputStream out, String charset, boolean omitEnvelope) throws IOException {
|
||||
write( tagNode, new OutputStreamWriter(out, charset), charset, omitEnvelope );
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified TagNode to the output stream, using specified charset.
|
||||
* @param tagNode Node to be written
|
||||
* @param out Output stream
|
||||
* @param charset Charset of the output
|
||||
* @throws IOException
|
||||
*/
|
||||
public void writeToStream(TagNode tagNode, OutputStream out, String charset) throws IOException {
|
||||
writeToStream(tagNode, out, charset, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified TagNode to the output stream, using system default charset and optionally omits node envelope
|
||||
* (skips open and close tags of the node).
|
||||
* @param tagNode Node to be written
|
||||
* @param out Output stream
|
||||
* @param omitEnvelope Tells whether to skip open and close tag of the node.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void writeToStream(TagNode tagNode, OutputStream out, boolean omitEnvelope) throws IOException {
|
||||
writeToStream( tagNode, out, props.getCharset(), omitEnvelope );
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified TagNode to the output stream, using system default charset.
|
||||
* @param tagNode Node to be written
|
||||
* @param out Output stream
|
||||
* @throws IOException
|
||||
*/
|
||||
public void writeToStream(TagNode tagNode, OutputStream out) throws IOException {
|
||||
writeToStream(tagNode, out, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified TagNode to the file, using specified charset and optionally omits node envelope
|
||||
* (skips open and close tags of the node).
|
||||
* @param tagNode Node to be written
|
||||
* @param fileName Output file name
|
||||
* @param charset Charset of the output
|
||||
* @param omitEnvelope Tells whether to skip open and close tag of the node.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void writeToFile(TagNode tagNode, String fileName, String charset, boolean omitEnvelope) throws IOException {
|
||||
writeToStream(tagNode, new FileOutputStream(fileName), charset, omitEnvelope );
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified TagNode to the file, using specified charset.
|
||||
* @param tagNode Node to be written
|
||||
* @param fileName Output file name
|
||||
* @param charset Charset of the output
|
||||
* @throws IOException
|
||||
*/
|
||||
public void writeToFile(TagNode tagNode, String fileName, String charset) throws IOException {
|
||||
writeToFile(tagNode, fileName, charset, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified TagNode to the file, using specified charset and optionally omits node envelope
|
||||
* (skips open and close tags of the node).
|
||||
* @param tagNode Node to be written
|
||||
* @param fileName Output file name
|
||||
* @param omitEnvelope Tells whether to skip open and close tag of the node.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void writeToFile(TagNode tagNode, String fileName, boolean omitEnvelope) throws IOException {
|
||||
writeToFile(tagNode,fileName, props.getCharset(), omitEnvelope);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified TagNode to the file, using system default charset.
|
||||
* @param tagNode Node to be written
|
||||
* @param fileName Output file name
|
||||
* @throws IOException
|
||||
*/
|
||||
public void writeToFile(TagNode tagNode, String fileName) throws IOException {
|
||||
writeToFile(tagNode, fileName, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tagNode Node to serialize to string
|
||||
* @param charset Charset of the output - stands in xml declaration part
|
||||
* @param omitEnvelope Tells whether to skip open and close tag of the node.
|
||||
* @return Output as string
|
||||
*/
|
||||
public String getAsString(TagNode tagNode, String charset, boolean omitEnvelope) {
|
||||
StringWriter writer = new StringWriter();
|
||||
try {
|
||||
write(tagNode, writer, charset, omitEnvelope);
|
||||
} catch (IOException e) {
|
||||
// not writing to the file system so any io errors should be really rare ( and bad)
|
||||
throw new HtmlCleanerException(e);
|
||||
}
|
||||
return writer.getBuffer().toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tagNode Node to serialize to string
|
||||
* @param charset Charset of the output - stands in xml declaration part
|
||||
* @return Output as string
|
||||
*/
|
||||
public String getAsString(TagNode tagNode, String charset) {
|
||||
return getAsString(tagNode, charset, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tagNode Node to serialize to string
|
||||
* @param omitEnvelope Tells whether to skip open and close tag of the node.
|
||||
* @return Output as string
|
||||
* @throws IOException
|
||||
*/
|
||||
public String getAsString(TagNode tagNode, boolean omitEnvelope) {
|
||||
return getAsString(tagNode, props.getCharset(), omitEnvelope);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tagNode Node to serialize to string
|
||||
* @return Output as string
|
||||
* @throws IOException
|
||||
*/
|
||||
public String getAsString(TagNode tagNode) {
|
||||
return getAsString(tagNode, false);
|
||||
}
|
||||
|
||||
public String getAsString(String htmlContent) {
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner(this.props);
|
||||
TagNode tagNode = htmlCleaner.clean(htmlContent);
|
||||
return getAsString(tagNode, props.getCharset());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Writes specified node using specified writer.
|
||||
* @param tagNode Node to serialize.
|
||||
* @param writer Writer instance
|
||||
* @param charset Charset of the output
|
||||
* @throws IOException
|
||||
*/
|
||||
public void write(TagNode tagNode, Writer writer, String charset) throws IOException {
|
||||
write(tagNode, writer, charset, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes specified node using specified writer.
|
||||
* @param tagNode Node to serialize.
|
||||
* @param writer Writer instance
|
||||
* @param charset Charset of the output
|
||||
* @param omitEnvelope Tells whether to skip open and close tag of the node.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void write(TagNode tagNode, Writer writer, String charset, boolean omitEnvelope) throws IOException {
|
||||
if (omitEnvelope) {
|
||||
tagNode = new HeadlessTagNode(tagNode);
|
||||
}
|
||||
writer = new BufferedWriter(writer);
|
||||
if ( !props.isOmitXmlDeclaration() ) {
|
||||
String declaration = "<?xml version=\"1.0\"";
|
||||
if (charset != null) {
|
||||
declaration += " encoding=\"" + charset + "\"";
|
||||
}
|
||||
declaration += "?>";
|
||||
writer.write(declaration + "\n");
|
||||
}
|
||||
|
||||
if ( !props.isOmitDoctypeDeclaration() ) {
|
||||
DoctypeToken doctypeToken = tagNode.getDocType();
|
||||
if ( doctypeToken != null ) {
|
||||
doctypeToken.serialize(this, writer);
|
||||
}
|
||||
}
|
||||
|
||||
serialize(tagNode, writer);
|
||||
|
||||
writer.flush();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
|
||||
protected boolean isScriptOrStyle(TagNode tagNode) {
|
||||
String tagName = tagNode.getName();
|
||||
return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
|
||||
}
|
||||
|
||||
protected abstract void serialize(TagNode tagNode, Writer writer) throws IOException;
|
||||
|
||||
}
|
||||
@@ -0,0 +1,75 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* <p>Simple HTML serializer - creates resulting HTML without indenting and/or compacting.</p>
|
||||
*/
|
||||
public class SimpleHtmlSerializer extends HtmlSerializer {
|
||||
|
||||
boolean escape = true;
|
||||
|
||||
public SimpleHtmlSerializer(CleanerProperties props, boolean escape) {
|
||||
super(props);
|
||||
this.escape = escape;
|
||||
}
|
||||
|
||||
public SimpleHtmlSerializer(CleanerProperties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
|
||||
serializeOpenTag(tagNode, writer, false);
|
||||
|
||||
if ( !isMinimizedTagSyntax(tagNode) ) {
|
||||
for (Object item: tagNode.getAllChildren()) {
|
||||
if ( item instanceof ContentNode) {
|
||||
String content = item.toString();
|
||||
writer.write( dontEscape(tagNode) || !escape ? content : escapeText(content) );
|
||||
} else if (item instanceof BaseToken) {
|
||||
((BaseToken)item).serialize(this, writer);
|
||||
}
|
||||
}
|
||||
|
||||
serializeEndTag(tagNode, writer, false);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,79 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* <p>Simple XML serializer - creates resulting XML without indenting lines.</p>
|
||||
*/
|
||||
public class SimpleXmlSerializer extends XmlSerializer {
|
||||
|
||||
public SimpleXmlSerializer(CleanerProperties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
|
||||
serializeOpenTag(tagNode, writer, false);
|
||||
|
||||
List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
|
||||
if ( !isMinimizedTagSyntax(tagNode) ) {
|
||||
Iterator<? extends BaseToken> childrenIt = tagChildren.iterator();
|
||||
while ( childrenIt.hasNext() ) {
|
||||
Object item = childrenIt.next();
|
||||
|
||||
if (item != null) {
|
||||
if (item instanceof CData) {
|
||||
serializeCData((CData)item, tagNode, writer);
|
||||
} else if ( item instanceof ContentNode ) {
|
||||
serializeContentToken((ContentNode)item, tagNode, writer);
|
||||
} else {
|
||||
((BaseToken)item).serialize(this, writer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
serializeEndTag(tagNode, writer, false);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,495 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* <p>This class contains map with special entities used in HTML and their
|
||||
* unicodes.</p>
|
||||
*
|
||||
* Created by: Vladimir Nikic<br/>
|
||||
* Date: November, 2006.
|
||||
*/
|
||||
public class SpecialEntities {
|
||||
|
||||
public static final SpecialEntities INSTANCE = new SpecialEntities(true, true) {
|
||||
@Override
|
||||
public void put(SpecialEntity specialEntity) {
|
||||
throw new UnsupportedOperationException("cannot add to this instance");
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* key is the {@link SpecialEntity#getKey()} ( i.e. "quot" )
|
||||
*/
|
||||
private Map<String, SpecialEntity> entities = new HashMap<String, SpecialEntity>();
|
||||
/**
|
||||
* Key is the Integer returned by {@link SpecialEntity#intValue()}
|
||||
*/
|
||||
private Map<Integer, SpecialEntity> entitiesByUnicodeCharcode = new HashMap<Integer, SpecialEntity>();
|
||||
private boolean greek;
|
||||
private boolean math;
|
||||
private int maxEntityLength;
|
||||
public static final char NON_BREAKABLE_SPACE = 160;
|
||||
|
||||
public SpecialEntities(boolean greek, boolean math) {
|
||||
this.greek = greek;
|
||||
this.math = math;
|
||||
_put(new SpecialEntity("null", 0, "", true));
|
||||
_put(new SpecialEntity("nbsp", NON_BREAKABLE_SPACE, null, true));
|
||||
_put(new SpecialEntity("iexcl", 161, null, true));
|
||||
_put(new SpecialEntity("cent", 162, null, true));
|
||||
_put(new SpecialEntity("pound", 163, null, true));
|
||||
_put(new SpecialEntity("curren", 164, null, true));
|
||||
_put(new SpecialEntity("yen", 165, null, true));
|
||||
_put(new SpecialEntity("brvbar", 166, null, true));
|
||||
_put(new SpecialEntity("sect", 167, null, true));
|
||||
_put(new SpecialEntity("uml", 168, null, true));
|
||||
_put(new SpecialEntity("copy", 169, null, true));
|
||||
_put(new SpecialEntity("ordf", 170, null, true));
|
||||
_put(new SpecialEntity("laquo", 171, null, true));
|
||||
_put(new SpecialEntity("not", 172, null, true));
|
||||
_put(new SpecialEntity("shy", 173, null, true));
|
||||
_put(new SpecialEntity("reg", 174, null, true));
|
||||
_put(new SpecialEntity("macr", 175, null, true));
|
||||
_put(new SpecialEntity("deg", 176, null, true));
|
||||
_put(new SpecialEntity("plusmn", 177, null, true));
|
||||
_put(new SpecialEntity("sup2", 178, null, true));
|
||||
_put(new SpecialEntity("sup3", 179, null, true));
|
||||
_put(new SpecialEntity("acute", 180, null, true));
|
||||
_put(new SpecialEntity("micro", 181, null, true));
|
||||
_put(new SpecialEntity("para", 182, null, true));
|
||||
_put(new SpecialEntity("middot", 183, null, true));
|
||||
_put(new SpecialEntity("cedil", 184, null, true));
|
||||
_put(new SpecialEntity("sup1", 185, null, true));
|
||||
_put(new SpecialEntity("ordm", 186, null, true));
|
||||
_put(new SpecialEntity("raquo", 187, null, true));
|
||||
_put(new SpecialEntity("frac14", 188, null, true));
|
||||
_put(new SpecialEntity("frac12", 189, null, true));
|
||||
_put(new SpecialEntity("frac34", 190, null, true));
|
||||
_put(new SpecialEntity("iquest", 191, null, true));
|
||||
_put(new SpecialEntity("Agrave", 192, null, true));
|
||||
_put(new SpecialEntity("Aacute", 193, null, true));
|
||||
_put(new SpecialEntity("Acirc", 194, null, true));
|
||||
_put(new SpecialEntity("Atilde", 195, null, true));
|
||||
|
||||
_put(new SpecialEntity("Auml", 196, null, true));
|
||||
_put(new SpecialEntity("Aring", 197, null, true));
|
||||
_put(new SpecialEntity("AElig", 198, null, true));
|
||||
_put(new SpecialEntity("Ccedil", 199, null, true));
|
||||
_put(new SpecialEntity("Egrave", 200, null, true));
|
||||
_put(new SpecialEntity("Eacute", 201, null, true));
|
||||
_put(new SpecialEntity("Ecirc", 202, null, true));
|
||||
_put(new SpecialEntity("Euml", 203, null, true));
|
||||
_put(new SpecialEntity("Igrave", 204, null, true));
|
||||
_put(new SpecialEntity("Iacute", 205, null, true));
|
||||
_put(new SpecialEntity("Icirc", 206, null, true));
|
||||
_put(new SpecialEntity("Iuml", 207, null, true));
|
||||
_put(new SpecialEntity("ETH", 208, null, true));
|
||||
_put(new SpecialEntity("Ntilde", 209, null, true));
|
||||
_put(new SpecialEntity("Ograve", 210, null, true));
|
||||
_put(new SpecialEntity("Oacute", 211, null, true));
|
||||
_put(new SpecialEntity("Ocirc", 212, null, true));
|
||||
_put(new SpecialEntity("Otilde", 213, null, true));
|
||||
_put(new SpecialEntity("Ouml", 214, null, true));
|
||||
_put(new SpecialEntity("times", 215, null, true));
|
||||
_put(new SpecialEntity("Oslash", 216, null, true));
|
||||
_put(new SpecialEntity("Ugrave", 217, null, true));
|
||||
_put(new SpecialEntity("Uacute", 218, null, true));
|
||||
_put(new SpecialEntity("Ucirc", 219, null, true));
|
||||
_put(new SpecialEntity("Uuml", 220, null, true));
|
||||
_put(new SpecialEntity("Yacute", 221, null, true));
|
||||
_put(new SpecialEntity("THORN", 222, null, true));
|
||||
_put(new SpecialEntity("szlig", 223, null, true));
|
||||
_put(new SpecialEntity("agrave", 224, null, true));
|
||||
_put(new SpecialEntity("aacute", 225, null, true));
|
||||
_put(new SpecialEntity("acirc", 226, null, true));
|
||||
_put(new SpecialEntity("atilde", 227, null, true));
|
||||
_put(new SpecialEntity("auml", 228, null, true));
|
||||
_put(new SpecialEntity("aring", 229, null, true));
|
||||
_put(new SpecialEntity("aelig", 230, null, true));
|
||||
_put(new SpecialEntity("ccedil", 231, null, true));
|
||||
_put(new SpecialEntity("egrave", 232, null, true));
|
||||
_put(new SpecialEntity("eacute", 233, null, true));
|
||||
_put(new SpecialEntity("ecirc", 234, null, true));
|
||||
_put(new SpecialEntity("euml", 235, null, true));
|
||||
_put(new SpecialEntity("igrave", 236, null, true));
|
||||
_put(new SpecialEntity("iacute", 237, null, true));
|
||||
_put(new SpecialEntity("icirc", 238, null, true));
|
||||
_put(new SpecialEntity("iuml", 239, null, true));
|
||||
_put(new SpecialEntity("eth", 240, null, true));
|
||||
_put(new SpecialEntity("ntilde", 241, null, true));
|
||||
_put(new SpecialEntity("ograve", 242, null, true));
|
||||
_put(new SpecialEntity("oacute", 243, null, true));
|
||||
_put(new SpecialEntity("ocirc", 244, null, true));
|
||||
_put(new SpecialEntity("otilde", 245, null, true));
|
||||
_put(new SpecialEntity("ouml", 246, null, true));
|
||||
_put(new SpecialEntity("divide", 247, null, true));
|
||||
_put(new SpecialEntity("oslash", 248, null, true));
|
||||
_put(new SpecialEntity("ugrave", 249, null, true));
|
||||
_put(new SpecialEntity("uacute", 250, null, true));
|
||||
_put(new SpecialEntity("ucirc", 251, null, true));
|
||||
_put(new SpecialEntity("uuml", 252, null, true));
|
||||
_put(new SpecialEntity("yacute", 253, null, true));
|
||||
_put(new SpecialEntity("thorn", 254, null, true));
|
||||
_put(new SpecialEntity("yuml", 255, null, true));
|
||||
|
||||
_put(new SpecialEntity("OElig", 338, null, true));
|
||||
_put(new SpecialEntity("oelig", 339, null, true));
|
||||
_put(new SpecialEntity("Scaron", 352, null, true));
|
||||
_put(new SpecialEntity("scaron", 353, null, true));
|
||||
_put(new SpecialEntity("Yuml", 376, null, true));
|
||||
_put(new SpecialEntity("fnof", 402, null, true));
|
||||
_put(new SpecialEntity("circ", 710, null, true));
|
||||
_put(new SpecialEntity("tilde", 732, null, true));
|
||||
if ( this.greek ) {
|
||||
// 913 Alpha Α greek capital letter alpha
|
||||
_put(new SpecialEntity("Alpha", 913, null, true));
|
||||
// 914 Beta Β greek capital letter beta
|
||||
_put(new SpecialEntity("Beta", 914, null, true));
|
||||
// 915 Gamma Γ greek capital letter gamma
|
||||
_put(new SpecialEntity("Gamma", 915, null, true));
|
||||
// 916 Delta Δ greek capital letter delta
|
||||
_put(new SpecialEntity("Delta", 916, null, true));
|
||||
// 917 Epsilon Ε greek capital letter epsilon
|
||||
_put(new SpecialEntity("Epsilon", 917, null, true));
|
||||
// 918 Zeta Ζ greek capital letter zeta
|
||||
_put(new SpecialEntity("Zeta", 918, null, true));
|
||||
// 919 Eta Η greek capital letter eta
|
||||
_put(new SpecialEntity("Eta", 919, null, true));
|
||||
// 920 Theta Θ greek capital letter theta
|
||||
_put(new SpecialEntity("Theta", 920, null, true));
|
||||
// 921 Iota Ι greek capital letter iota
|
||||
_put(new SpecialEntity("Iota", 921, null, true));
|
||||
// 922 Kappa Κ greek capital letter kappa
|
||||
_put(new SpecialEntity("Kappa", 922, null, true));
|
||||
// 923 Lambda Λ greek capital letter lambda
|
||||
_put(new SpecialEntity("Lambda", 923, null, true));
|
||||
// 924 Mu Μ greek capital letter mu
|
||||
_put(new SpecialEntity("Mu", 924, null, true));
|
||||
// 925 Nu Ν greek capital letter nu
|
||||
_put(new SpecialEntity("Nu", 925, null, true));
|
||||
// 926 Xi Ξ greek capital letter xi
|
||||
_put(new SpecialEntity("Xi", 926, null, true));
|
||||
// 927 Omicron Ο greek capital letter omicron
|
||||
_put(new SpecialEntity("Omicron", 927, null, true));
|
||||
// 928 Pi Π greek capital letter pi
|
||||
_put(new SpecialEntity("Pi", 928, null, true));
|
||||
// 929 Rho Ρ greek capital letter rho
|
||||
_put(new SpecialEntity("Rho", 929, null, true));
|
||||
// there is no Sigmaf, and no U+03A2 character either
|
||||
// 931 Sigma Σ greek capital letter sigma
|
||||
_put(new SpecialEntity("Sigma", 931, null, true));
|
||||
// 932 Tau Τ greek capital letter tau
|
||||
_put(new SpecialEntity("Tau", 932, null, true));
|
||||
// 933 Upsilon Υ greek capital letter upsilon
|
||||
_put(new SpecialEntity("Upsilon", 933, null, true));
|
||||
// 934 Phi Φ greek capital letter phi
|
||||
_put(new SpecialEntity("Phi", 934, null, true));
|
||||
// 935 Chi Χ greek capital letter chi
|
||||
_put(new SpecialEntity("Chi", 935, null, true));
|
||||
// 936 Psi Ψ greek capital letter psi
|
||||
_put(new SpecialEntity("Psi", 936, null, true));
|
||||
// 937 Omega Ω greek capital letter omega
|
||||
_put(new SpecialEntity("Omega", 937, null, true));
|
||||
// 945 alpha α greek small letter alpha
|
||||
_put(new SpecialEntity("alpha", 945, null, true));
|
||||
// 946 beta β greek small letter beta
|
||||
_put(new SpecialEntity("beta", 946, null, true));
|
||||
// 947 gamma γ greek small letter gamma
|
||||
_put(new SpecialEntity("gamma", 947, null, true));
|
||||
// 948 delta δ greek small letter delta
|
||||
_put(new SpecialEntity("delta", 948, null, true));
|
||||
// 949 epsilon ε greek small letter epsilon
|
||||
_put(new SpecialEntity("epsilon", 949, null, true));
|
||||
// 950 zeta ζ greek small letter zeta
|
||||
_put(new SpecialEntity("zeta", 950, null, true));
|
||||
// 951 eta η greek small letter eta
|
||||
_put(new SpecialEntity("eta", 951, null, true));
|
||||
// 952 theta θ greek small letter theta
|
||||
_put(new SpecialEntity("theta", 952, null, true));
|
||||
// 953 iota ι greek small letter iota
|
||||
_put(new SpecialEntity("iota", 953, null, true));
|
||||
// 954 kappa κ greek small letter kappa
|
||||
_put(new SpecialEntity("kappa", 954, null, true));
|
||||
// 955 lambda λ greek small letter lambda
|
||||
_put(new SpecialEntity("lambda", 955, null, true));
|
||||
// 956 mu μ greek small letter mu
|
||||
_put(new SpecialEntity("mu", 956, null, true));
|
||||
// 957 nu ν greek small letter nu
|
||||
_put(new SpecialEntity("nu", 957, null, true));
|
||||
// 958 xi ξ greek small letter xi
|
||||
_put(new SpecialEntity("xi", 958, null, true));
|
||||
// 959 omicron ο greek small letter omicron
|
||||
_put(new SpecialEntity("omicron", 959, null, true));
|
||||
// 960 pi π greek small letter pi
|
||||
_put(new SpecialEntity("pi", 960, null, true));
|
||||
// 961 rho ρ greek small letter rho
|
||||
_put(new SpecialEntity("rho", 961, null, true));
|
||||
// 962 sigmaf ς greek small letter final sigma
|
||||
_put(new SpecialEntity("sigmaf", 962, null, true));
|
||||
// 963 sigma σ greek small letter sigma
|
||||
_put(new SpecialEntity("sigma", 963, null, true));
|
||||
// 964 tau τ greek small letter tau
|
||||
_put(new SpecialEntity("tau", 964, null, true));
|
||||
// 965 upsilon υ greek small letter upsilon
|
||||
_put(new SpecialEntity("upsilon", 965, null, true));
|
||||
// 966 phi φ greek small letter phi
|
||||
_put(new SpecialEntity("phi", 966, null, true));
|
||||
// 967 chi χ greek small letter chi
|
||||
_put(new SpecialEntity("chi", 967, null, true));
|
||||
// 968 psi ψ greek small letter psi
|
||||
_put(new SpecialEntity("psi", 968, null, true));
|
||||
// 969 omega ω greek small letter omega
|
||||
_put(new SpecialEntity("omega", 969, null, true));
|
||||
// 977 thetasym ϑ greek small letter theta symbol
|
||||
_put(new SpecialEntity("thetasym", 977, null, true));
|
||||
// 978 upsih ϒ greek upsilon with hook symbol
|
||||
_put(new SpecialEntity("upsih", 978, null, true));
|
||||
// 982 piv ϖ greek pi symbol
|
||||
_put(new SpecialEntity("piv", 982, null, true));
|
||||
}
|
||||
_put(new SpecialEntity("ensp", 8194, null, true));
|
||||
_put(new SpecialEntity("emsp", 8195, null, true));
|
||||
_put(new SpecialEntity("thinsp", 8201, null, true));
|
||||
_put(new SpecialEntity("zwnj", 8204, null, true));
|
||||
_put(new SpecialEntity("zwj", 8205, null, true));
|
||||
_put(new SpecialEntity("lrm", 8206, null, true));
|
||||
_put(new SpecialEntity("rlm", 8207, null, true));
|
||||
_put(new SpecialEntity("ndash", 8211, null, true));
|
||||
_put(new SpecialEntity("mdash", 8212, null, true));
|
||||
_put(new SpecialEntity("lsquo", 8216, null, true));
|
||||
_put(new SpecialEntity("rsquo", 8217, null, true));
|
||||
_put(new SpecialEntity("sbquo", 8218, null, true));
|
||||
_put(new SpecialEntity("ldquo", 8220, null, true));
|
||||
_put(new SpecialEntity("rdquo", 8221, null, true));
|
||||
_put(new SpecialEntity("bdquo", 8222, null, true));
|
||||
_put(new SpecialEntity("dagger", 8224, null, true));
|
||||
_put(new SpecialEntity("Dagger", 8225, null, true));
|
||||
_put(new SpecialEntity("bull", 8226, null, true));
|
||||
// three ellipses
|
||||
_put(new SpecialEntity("hellip", 8230, null, true));
|
||||
_put(new SpecialEntity("permil", 8240, null, true));
|
||||
_put(new SpecialEntity("prime", 8242, null, true));
|
||||
_put(new SpecialEntity("Prime", 8243, null, true));
|
||||
_put(new SpecialEntity("lsaquo", 8249, null, true));
|
||||
_put(new SpecialEntity("rsaquo", 8250, null, true));
|
||||
_put(new SpecialEntity("oline", 8254, null, true));
|
||||
_put(new SpecialEntity("frasl", 8260, null, true));
|
||||
_put(new SpecialEntity("euro", 8364, null, true));
|
||||
_put(new SpecialEntity("image", 8465, null, true));
|
||||
_put(new SpecialEntity("weierp", 8472, null, true));
|
||||
_put(new SpecialEntity("real", 8476, null, true));
|
||||
_put(new SpecialEntity("trade", 8482, null, true));
|
||||
_put(new SpecialEntity("alefsym", 8501, null, true));
|
||||
_put(new SpecialEntity("larr", 8592, null, true));
|
||||
_put(new SpecialEntity("uarr", 8593, null, true));
|
||||
_put(new SpecialEntity("rarr", 8594, null, true));
|
||||
_put(new SpecialEntity("darr", 8595, null, true));
|
||||
_put(new SpecialEntity("harr", 8596, null, true));
|
||||
_put(new SpecialEntity("crarr", 8629, null, true));
|
||||
_put(new SpecialEntity("lArr", 8656, null, true));
|
||||
_put(new SpecialEntity("uArr", 8657, null, true));
|
||||
_put(new SpecialEntity("rArr", 8658, null, true));
|
||||
_put(new SpecialEntity("dArr", 8659, null, true));
|
||||
_put(new SpecialEntity("hArr", 8660, null, true));
|
||||
if (this.math) {
|
||||
// 8704 forall ∀ for all
|
||||
_put(new SpecialEntity("forall", 8704, null, true));
|
||||
//8706 part ∂ partial differential
|
||||
_put(new SpecialEntity("part", 8706, null, true));
|
||||
//8707 exist ∃ there exists
|
||||
_put(new SpecialEntity("exist", 8707, null, true));
|
||||
//8709 empty ∅ empty set = null set = diameter
|
||||
_put(new SpecialEntity("empty", 8709, null, true));
|
||||
//8711 nabla ∇ nabla = backward difference
|
||||
_put(new SpecialEntity("nabla", 8711, null, true));
|
||||
//8712 isin ∈ element of
|
||||
_put(new SpecialEntity("isin", 8712, null, true));
|
||||
//8713 notin ∉ not an element of
|
||||
_put(new SpecialEntity("notin", 8713, null, true));
|
||||
//8715 ni ∋ contains as member
|
||||
_put(new SpecialEntity("ni", 8715, null, true));
|
||||
//8719 prod ∏ n-ary product = product sign
|
||||
//prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both
|
||||
_put(new SpecialEntity("prod", 8719, null, true));
|
||||
//8721 sum ∑ n-ary sumation
|
||||
//sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both
|
||||
_put(new SpecialEntity("sum", 8721, null, true));
|
||||
//8722 minus − minus sign
|
||||
_put(new SpecialEntity("minus", 8722, null, true));
|
||||
//8727 lowast ∗ asterisk operator
|
||||
_put(new SpecialEntity("lowast", 8727, null, true));
|
||||
//8730 radic √ square root = radical sign
|
||||
_put(new SpecialEntity("radic", 8730, null, true));
|
||||
//8733 prop ∝ proportional to
|
||||
_put(new SpecialEntity("prop", 8733, null, true));
|
||||
//8734 infin ∞ infinity
|
||||
_put(new SpecialEntity("infin", 8734, null, true));
|
||||
//8736 ang ∠ angle
|
||||
_put(new SpecialEntity("ang", 8736, null, true));
|
||||
//8743 and ∧ logical and = wedge
|
||||
_put(new SpecialEntity("and", 8743, null, true));
|
||||
//8744 or ∨ logical or = vee
|
||||
_put(new SpecialEntity("or", 8744, null, true));
|
||||
//8745 cap ∩ intersection = cap
|
||||
_put(new SpecialEntity("cap", 8745, null, true));
|
||||
//8746 cup ∪ union = cup
|
||||
_put(new SpecialEntity("cup", 8746, null, true));
|
||||
//8747 int ∫ integral
|
||||
_put(new SpecialEntity("int", 8747, null, true));
|
||||
//8756 there4 ∴ therefore
|
||||
_put(new SpecialEntity("there4", 8756, null, true));
|
||||
//8764 sim ∼ tilde operator = varies with = similar to
|
||||
//tilde operator is NOT the same character as the tilde, U+007E, although the same glyph might be used to represent both
|
||||
_put(new SpecialEntity("sim", 8764, null, true));
|
||||
//8773 cong ≅ approximately equal to
|
||||
_put(new SpecialEntity("cong", 8773, null, true));
|
||||
//8776 asymp ≈ almost equal to = asymptotic to
|
||||
_put(new SpecialEntity("asymp", 8776, null, true));
|
||||
//8800 ne ≠ not equal to
|
||||
_put(new SpecialEntity("ne", 8800, null, true));
|
||||
//8801 equiv ≡ identical to
|
||||
_put(new SpecialEntity("equiv", 8801, null, true));
|
||||
//8804 le ≤ less-than or equal to
|
||||
_put(new SpecialEntity("le", 8804, null, true));
|
||||
//8805 ge ≥ greater-than or equal to
|
||||
_put(new SpecialEntity("ge", 8805, null, true));
|
||||
//8834 sub ⊂ subset of
|
||||
_put(new SpecialEntity("sub", 8834, null, true));
|
||||
//8835 sup ⊃ superset of
|
||||
_put(new SpecialEntity("sup", 8835, null, true));
|
||||
//note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included. Should it be, for symmetry? It is in ISOamsn
|
||||
//8836 nsub ⊄ not a subset of
|
||||
_put(new SpecialEntity("nsub", 8836, null, true));
|
||||
//8838 sube ⊆ subset of or equal to
|
||||
_put(new SpecialEntity("sube", 8838, null, true));
|
||||
//8839 supe ⊇ superset of or equal to
|
||||
_put(new SpecialEntity("supe", 8839, null, true));
|
||||
//8853 oplus ⊕ circled plus = direct sum
|
||||
_put(new SpecialEntity("oplus", 8853, null, true));
|
||||
//8855 otimes ⊗ circled times = vector product
|
||||
_put(new SpecialEntity("otimes", 8855, null, true));
|
||||
//8869 perp ⊥ up tack = orthogonal to = perpendicular
|
||||
_put(new SpecialEntity("perp", 8869, null, true));
|
||||
//8901 sdot ⋅ dot operator
|
||||
_put(new SpecialEntity("sdot", 8901, null, true));
|
||||
//dot operator is NOT the same character as U+00B7 middle dot
|
||||
//8968 lceil ⌈ left ceiling = apl upstile
|
||||
_put(new SpecialEntity("lceil", 8968, null, true));
|
||||
//8969 rceil ⌉ right ceiling
|
||||
_put(new SpecialEntity("rceil", 8969, null, true));
|
||||
//8970 lfloor ⌊ left floor = apl downstile
|
||||
_put(new SpecialEntity("lfloor", 8970, null, true));
|
||||
//8971 rfloor ⌋ right floor
|
||||
_put(new SpecialEntity("rfloor", 8971, null, true));
|
||||
//9001 lang 〈 left-pointing angle bracket = bra
|
||||
//lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark'
|
||||
_put(new SpecialEntity("lang", 9001, null, true));
|
||||
//9002 rang 〉 right-pointing angle bracket = ket
|
||||
//rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark'
|
||||
_put(new SpecialEntity("rang", 9002, null, true));
|
||||
//9674 loz ◊ lozenge
|
||||
_put(new SpecialEntity("loz", 9674, null, true));
|
||||
//black here seems to mean filled as opposed to hollow
|
||||
//9824 spades ♠ black spade suit
|
||||
_put(new SpecialEntity("spades", 9824, null, true));
|
||||
//9827 clubs ♣ black club suit = shamrock
|
||||
_put(new SpecialEntity("clubs", 9827, null, true));
|
||||
//9829 hearts ♥ black heart suit = valentine
|
||||
_put(new SpecialEntity("hearts", 9829, null, true));
|
||||
//9830 diams ♦ black diamond suit
|
||||
_put(new SpecialEntity("diams", 9830, null, true));
|
||||
}
|
||||
_put(new SpecialEntity("amp", '&', null, false));
|
||||
_put(new SpecialEntity("lt", '<', null, false));
|
||||
_put(new SpecialEntity("gt", '>', null, false));
|
||||
_put(new SpecialEntity("quot", '"', null, false));
|
||||
// this is xml only -- apos appearing in html needs to be converted to ' or maybe ' to be universally safe
|
||||
// may need to special case for html attributes that use ' as surrounding delimeter on attribute value (instead of " ) : <a href='javascript:foo("bar'")' >wierd link</a>
|
||||
_put(new SpecialEntity("apos", '\'', "'", false));
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param seq may have a leading & and/or trailing ; ( those will be removed prior to comparision)
|
||||
* @return {@link SpecialEntity} if found.
|
||||
*/
|
||||
public SpecialEntity getSpecialEntity(String seq) {
|
||||
if (seq.length() == 0) return null;
|
||||
int startIndex = seq.charAt(0) == '&'?1:0;
|
||||
int semiIndex = seq.indexOf(';');
|
||||
String entity;
|
||||
if (semiIndex < 0) {
|
||||
entity = seq.substring(startIndex);
|
||||
} else {
|
||||
entity = seq.substring(startIndex, semiIndex);
|
||||
}
|
||||
SpecialEntity specialEntity = entities.get(entity);
|
||||
return specialEntity;
|
||||
}
|
||||
|
||||
public SpecialEntity getSpecialEntityByUnicode(int unicodeCharcode) {
|
||||
return this.entitiesByUnicodeCharcode.get(unicodeCharcode);
|
||||
}
|
||||
|
||||
public void put(SpecialEntity specialEntity) {
|
||||
_put(specialEntity);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param specialEntity
|
||||
*/
|
||||
private void _put(SpecialEntity specialEntity) {
|
||||
SpecialEntity old;
|
||||
old = entities.put(specialEntity.getKey(), specialEntity);
|
||||
if ( old != null ) {
|
||||
throw new HtmlCleanerException("replaced "+old+" with "+specialEntity);
|
||||
}
|
||||
old = entitiesByUnicodeCharcode.put(specialEntity.intValue(), specialEntity);
|
||||
if ( old != null ) {
|
||||
throw new HtmlCleanerException("replaced "+old+" with "+specialEntity);
|
||||
}
|
||||
this.maxEntityLength = Math.max(this.maxEntityLength,specialEntity.getKey().length());
|
||||
}
|
||||
public int getMaxEntityLength() {
|
||||
return maxEntityLength;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,135 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
 * Immutable description of one named character entity: its name, its numeric
 * character code, and the strings to emit for it in HTML and in XML output.
 */
public class SpecialEntity {
    private final String key;
    private final int intCode;
    // value emitted when generating html
    private final String htmlString;
    private final boolean htmlSpecialEntity;
    // value emitted when generating xml
    private final String escapedXmlString;

    /**
     * @param key value between the {@code &} and the {@code ;}, for example 'amp' for '&amp;amp;'
     * @param intCode numeric character code of the entity
     * @param htmlString string to emit in html output; when {@code null} the named
     *        form {@code "&" + key + ";"} is used
     * @param htmlSpecialEntity entity is affected by translateSpecialEntities property setting.
     *        When {@code true} the xml form is the literal character(s) for {@code intCode};
     *        when {@code false} the xml form is the named form {@code "&" + key + ";"}.
     */
    public SpecialEntity(String key, int intCode, String htmlString, boolean htmlSpecialEntity) {
        this.key = key;
        this.intCode = intCode;
        String namedForm = "&" + key + ";";
        this.htmlString = htmlString != null ? htmlString : namedForm;
        this.escapedXmlString = htmlSpecialEntity ? literalFor(intCode) : namedForm;
        this.htmlSpecialEntity = htmlSpecialEntity;
    }

    /**
     * Converts a character code to its literal text. Valid code points above
     * U+FFFF become a surrogate pair instead of being truncated to a single
     * (wrong) char; values that are not valid code points keep the historical
     * narrowing cast so construction never fails.
     */
    private static String literalFor(int code) {
        if (Character.isValidCodePoint(code)) {
            return new String(Character.toChars(code));
        }
        return String.valueOf((char) code);
    }

    /**
     * @return the key (entity name without {@code &} and {@code ;})
     */
    public String getKey() {
        return key;
    }

    /**
     * @return the intCode
     */
    public int intValue() {
        return intCode;
    }

    /**
     * @return the string to emit for this entity in html output
     */
    public String getHtmlString() {
        return htmlString;
    }

    /**
     * @return the string to emit for this entity in xml output
     */
    public String getEscapedXmlString() {
        return this.escapedXmlString;
    }

    /**
     * @param htmlEscaped {@code true} to get the html form, {@code false} for the xml form
     * @return {@link #getHtmlString()} or {@link #getEscapedXmlString()} accordingly
     */
    public String getEscaped(boolean htmlEscaped) {
        return htmlEscaped ? this.getHtmlString() : this.getEscapedXmlString();
    }

    /**
     * @return the htmlSpecialEntity flag given at construction
     */
    public boolean isHtmlSpecialEntity() {
        return htmlSpecialEntity;
    }

    /**
     * @return {@link #intValue()} cast to a char; the cast keeps only the low
     *         16 bits, so codes above U+FFFF are not representable here
     */
    public char charValue() {
        return (char) intValue();
    }

    /**
     * @return Numeric Character Reference in decimal format
     */
    public String getDecimalNCR() {
        return "&#" + intCode + ";";
    }

    /**
     * @return Numeric Character Reference in hex format
     */
    public String getHexNCR() {
        return "&#x" + Integer.toHexString(intCode) + ";";
    }

    /**
     * @return Escaped value of the entity, i.e. {@code "&" + key + ";"}
     */
    public String getEscapedValue() {
        return "&" + key + ";";
    }
}
|
||||
@@ -0,0 +1,447 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Class contains information about single HTML tag.<br/>
|
||||
* It also contains rules for tag balancing. For each tag, list of dependent
|
||||
* tags may be defined. There are several kinds of dependencies used to reorder
|
||||
* tags:
|
||||
* <ul>
|
||||
* <li>
|
||||
* fatal tags - required outer tag - the tag will be ignored during
|
||||
* parsing (will be skipped) if this fatal tag is missing. For example, most web
|
||||
* browsers ignore elements TD, TR, TBODY if they are not in the context of TABLE tag.
|
||||
* </li>
|
||||
* <li>
|
||||
* required enclosing tags - if there is no such, it is implicitly
|
||||
* created. For example if TD is out of TR - open TR is created before.
|
||||
* </li>
|
||||
* <li>
|
||||
* forbidden tags - it is not allowed to occur inside - for example
|
||||
* FORM cannot be inside other FORM and it will be ignored during cleanup.
|
||||
* </li>
|
||||
* <li>
|
||||
* allowed children tags - for example TR allows TD and TH. If there
|
||||
* are some dependent allowed tags defined then cleaner ignores other tags, treating
|
||||
* them as not allowed, unless they are in some other relationship with this tag.
|
||||
* </li>
|
||||
* <li>
|
||||
* preferred child tag - where a child tag doesn't match, but we want to by default
|
||||
* insert an intervening tag rather than just move it outside. For example, LI in UL, TD in TR.
|
||||
* </li>
|
||||
* <li>
|
||||
* higher level tags - for example for TR higher tags are THEAD, TBODY, TFOOT.
|
||||
* </li>
|
||||
* <li>
|
||||
* tags that must be closed and copied - for example, in
|
||||
* <code><a href="#"><div>....</code> tag A must be closed before DIV but
|
||||
* copied again inside DIV.
|
||||
* </li>
|
||||
* <li>
|
||||
* tags that must be closed before closing this tag and copied again after -
|
||||
* for example, in <code><i><b>at</i> first</b> text </code>
|
||||
* tag B must be closed before closing I, but it must be copied again after resulting
|
||||
* finally in sequence: <code><i><b>at</b></i><b> first</b> text </code>.
|
||||
* </li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Tag TR for instance (table row) may define the following dependencies:
|
||||
* <ul>
|
||||
* <li>fatal tag is <code>table</code></li>
|
||||
* <li>required enclosing tag is <code>tbody</code></li>
|
||||
* <li>allowed children tags are <code>td,th</code></li>
|
||||
* <li>higher level tags are <code>thead,tfoot</code></li>
|
||||
* <li>tags that must be closed before are <code>tr,td,th,caption,colgroup</code></li>
|
||||
* </ul>
|
||||
* meaning the following: <br>
|
||||
* <ul>
|
||||
* <li><code>tr</code> must be in context of <code>table</code>, otherwise it will be ignored,</li>
|
||||
* <li><code>tr</code> can be directly inside <code>tbody</code>, <code>tfoot</code> and <code>thead</code>,
|
||||
* otherwise <code>tbody</code> will be implicitly created in front of it.</li>
|
||||
* <li><code>tr</code> can contain <code>td</code> and <code>th</code>, all other tags and content will be pushed out of current
|
||||
* limiting context, in the case of html tables, in front of enclosing <code>table</code> tag.</li>
|
||||
* <li>if previous open tag is one of <code>tr</code>, <code>caption</code> or <code>colgroup</code>, it will be implicitly closed.</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*/
|
||||
public class TagInfo {
|
||||
|
||||
public String getAssumedNamespace() {
|
||||
return assumedNamespace;
|
||||
}
|
||||
|
||||
public void setAssumedNamespace(String assumedNamespace) {
|
||||
this.assumedNamespace = assumedNamespace;
|
||||
}
|
||||
|
||||
public String getAssumedNamespacePrefix() {
|
||||
return assumedNamespacePrefix;
|
||||
}
|
||||
|
||||
public void setAssumedNamespacePrefix(String assumedNamespacePrefix) {
|
||||
this.assumedNamespacePrefix = assumedNamespacePrefix;
|
||||
}
|
||||
|
||||
|
||||
private String name;
|
||||
private ContentType contentType;
|
||||
private Set<String> mustCloseTags = new HashSet<String>();
|
||||
private Set<String> higherTags = new HashSet<String>();
|
||||
private Set<String> childTags = new HashSet<String>();
|
||||
private Set<String> permittedTags = new HashSet<String>();
|
||||
private Set<String> copyTags = new HashSet<String>();
|
||||
private Set<String> continueAfterTags = new HashSet<String>();
|
||||
private BelongsTo belongsTo = BelongsTo.BODY;
|
||||
private Set<String>requiredParentTags = new HashSet<String>();
|
||||
private Set<String>fatalTags = new HashSet<String>();
|
||||
private String preferredChildTag = null;
|
||||
private String assumedNamespace = null;
|
||||
private String assumedNamespacePrefix = null;
|
||||
private boolean deprecated;
|
||||
private boolean unique;
|
||||
private CloseTag closeTag;
|
||||
private Display display;
|
||||
|
||||
public TagInfo(String name, ContentType contentType, BelongsTo belongsTo, boolean deprecated, boolean unique, boolean ignorePermitted, CloseTag closeTag, Display display) {
|
||||
this.name = name;
|
||||
this.contentType = contentType;
|
||||
this.belongsTo = belongsTo;
|
||||
this.deprecated = deprecated;
|
||||
this.unique = unique;
|
||||
this.closeTag = closeTag;
|
||||
this.display = display;
|
||||
}
|
||||
|
||||
public void defineFatalTags(String commaSeparatedListOfTags) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String currTag = tokenizer.nextToken();
|
||||
this.fatalTags.add(currTag);
|
||||
this.higherTags.add(currTag);
|
||||
}
|
||||
}
|
||||
|
||||
public void defineRequiredEnclosingTags(String commaSeparatedListOfTags) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String currTag = tokenizer.nextToken();
|
||||
this.requiredParentTags.add(currTag);
|
||||
this.higherTags.add(currTag);
|
||||
}
|
||||
}
|
||||
|
||||
public void defineForbiddenTags(String commaSeparatedListOfTags) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String currTag = tokenizer.nextToken();
|
||||
this.permittedTags.add(currTag);
|
||||
}
|
||||
}
|
||||
|
||||
public void defineAllowedChildrenTags(String commaSeparatedListOfTags) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String currTag = tokenizer.nextToken();
|
||||
this.childTags.add(currTag);
|
||||
}
|
||||
}
|
||||
|
||||
public void defineHigherLevelTags(String commaSeparatedListOfTags) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String currTag = tokenizer.nextToken();
|
||||
this.higherTags.add(currTag);
|
||||
}
|
||||
}
|
||||
|
||||
public void defineCloseBeforeCopyInsideTags(String commaSeparatedListOfTags) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String currTag = tokenizer.nextToken();
|
||||
this.copyTags.add(currTag);
|
||||
this.mustCloseTags.add(currTag);
|
||||
}
|
||||
}
|
||||
|
||||
public void defineCloseInsideCopyAfterTags(String commaSeparatedListOfTags) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String currTag = tokenizer.nextToken();
|
||||
this.continueAfterTags.add(currTag);
|
||||
}
|
||||
}
|
||||
|
||||
public void defineCloseBeforeTags(String commaSeparatedListOfTags) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(), ",");
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
String currTag = tokenizer.nextToken();
|
||||
this.mustCloseTags.add(currTag);
|
||||
}
|
||||
}
|
||||
|
||||
// getters and setters
|
||||
|
||||
public Display getDisplay() {
|
||||
return display;
|
||||
}
|
||||
|
||||
public void setDisplay(Display display) {
|
||||
this.display = display;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public ContentType getContentType() {
|
||||
return contentType;
|
||||
}
|
||||
|
||||
public Set<String> getMustCloseTags() {
|
||||
return mustCloseTags;
|
||||
}
|
||||
|
||||
public void setMustCloseTags(Set<String> mustCloseTags) {
|
||||
this.mustCloseTags = mustCloseTags;
|
||||
}
|
||||
|
||||
public Set<String> getHigherTags() {
|
||||
return higherTags;
|
||||
}
|
||||
|
||||
public void setHigherTags(Set<String> higherTags) {
|
||||
this.higherTags = higherTags;
|
||||
}
|
||||
|
||||
public Set<String> getChildTags() {
|
||||
return childTags;
|
||||
}
|
||||
|
||||
public void setChildTags(Set<String> childTags) {
|
||||
this.childTags = childTags;
|
||||
}
|
||||
|
||||
public Set<String> getPermittedTags() {
|
||||
return permittedTags;
|
||||
}
|
||||
|
||||
public void setPermittedTags(Set<String> permittedTags) {
|
||||
this.permittedTags = permittedTags;
|
||||
}
|
||||
|
||||
public Set<String> getCopyTags() {
|
||||
return copyTags;
|
||||
}
|
||||
|
||||
public void setCopyTags(Set<String> copyTags) {
|
||||
this.copyTags = copyTags;
|
||||
}
|
||||
|
||||
public Set<String> getContinueAfterTags() {
|
||||
return continueAfterTags;
|
||||
}
|
||||
|
||||
public void setContinueAfterTags(Set<String> continueAfterTags) {
|
||||
this.continueAfterTags = continueAfterTags;
|
||||
}
|
||||
|
||||
public Set<String> getRequiredParentTags() {
|
||||
return requiredParentTags;
|
||||
}
|
||||
|
||||
public void setRequiredParent(String requiredParent) {
|
||||
this.requiredParentTags.add(requiredParent);
|
||||
}
|
||||
|
||||
public BelongsTo getBelongsTo() {
|
||||
return belongsTo;
|
||||
}
|
||||
|
||||
public void setBelongsTo(BelongsTo belongsTo) {
|
||||
this.belongsTo = belongsTo;
|
||||
}
|
||||
|
||||
public Set<String> getFatalTags(){
|
||||
return this.fatalTags;
|
||||
}
|
||||
|
||||
public boolean isFatalTag(String tag){
|
||||
for (String fatalTag:this.fatalTags){
|
||||
if (tag.equals(fatalTag)) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public void setFatalTag(String fatalTag) {
|
||||
this.fatalTags.add(fatalTag);
|
||||
}
|
||||
|
||||
public boolean isDeprecated() {
|
||||
return deprecated;
|
||||
}
|
||||
|
||||
public void setDeprecated(boolean deprecated) {
|
||||
this.deprecated = deprecated;
|
||||
}
|
||||
|
||||
public boolean isUnique() {
|
||||
return unique;
|
||||
}
|
||||
|
||||
public void setUnique(boolean unique) {
|
||||
this.unique = unique;
|
||||
}
|
||||
|
||||
public boolean isEmptyTag() {
|
||||
return ContentType.none == contentType;
|
||||
}
|
||||
|
||||
// other functionality
|
||||
|
||||
boolean allowsBody() {
|
||||
return ContentType.none != contentType;
|
||||
}
|
||||
|
||||
boolean isHigher(String tagName) {
|
||||
return higherTags.contains(tagName);
|
||||
}
|
||||
|
||||
boolean isCopy(String tagName) {
|
||||
return copyTags.contains(tagName);
|
||||
}
|
||||
|
||||
boolean hasCopyTags() {
|
||||
return !copyTags.isEmpty();
|
||||
}
|
||||
|
||||
boolean isContinueAfter(String tagName) {
|
||||
return continueAfterTags.contains(tagName);
|
||||
}
|
||||
|
||||
boolean hasPermittedTags() {
|
||||
return !permittedTags.isEmpty();
|
||||
}
|
||||
|
||||
boolean isHeadTag() {
|
||||
return belongsTo == BelongsTo.HEAD;
|
||||
}
|
||||
|
||||
boolean isHeadAndBodyTag() {
|
||||
return belongsTo == BelongsTo.HEAD || belongsTo == BelongsTo.HEAD_AND_BODY;
|
||||
}
|
||||
|
||||
boolean isMustCloseTag(TagInfo tagInfo) {
|
||||
if (tagInfo != null) {
|
||||
return mustCloseTags.contains( tagInfo.getName() ) || tagInfo.contentType == ContentType.text;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param token
|
||||
* @return true if the passed token is allowed to be nested in a Tag with this TagInfo.
|
||||
*/
|
||||
boolean allowsItem(BaseToken token) {
|
||||
if ( contentType != ContentType.none && token instanceof TagToken ) {
|
||||
TagToken tagToken = (TagToken) token;
|
||||
String tagName = tagToken.getName();
|
||||
if ( "script".equals(tagName) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
switch (contentType) {
|
||||
case all:
|
||||
if ( !childTags.isEmpty() ) {
|
||||
if ( token instanceof TagToken) {
|
||||
return childTags.contains( ((TagToken)token).getName() );
|
||||
}
|
||||
} else if ( !permittedTags.isEmpty() ) {
|
||||
if ( token instanceof TagToken) {
|
||||
return !permittedTags.contains( ((TagToken)token).getName() );
|
||||
}
|
||||
}
|
||||
return true;
|
||||
case text:
|
||||
return !(token instanceof TagToken);
|
||||
case none:
|
||||
if ( token instanceof ContentNode ) {
|
||||
// allow white space in outputed html
|
||||
return ( (ContentNode)token).isBlank();
|
||||
} else if (!(token instanceof TagToken)) {
|
||||
// allow directives.
|
||||
return true;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
boolean allowsAnything() {
|
||||
return ContentType.all == contentType && childTags.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return True if the tag can be minimized
|
||||
*/
|
||||
public boolean isMinimizedTagPermitted() {
|
||||
return this.closeTag.isMinimizedTagPermitted();
|
||||
}
|
||||
|
||||
public String getPreferredChildTag() {
|
||||
return preferredChildTag;
|
||||
}
|
||||
|
||||
public void setPreferredChildTag(String preferredChildTag) {
|
||||
this.preferredChildTag = preferredChildTag;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,889 @@
|
||||
/* Copyright (c) 2006-2014, HTMLCleaner project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
http://htmlcleaner.sourceforge.net/
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Writer;
|
||||
import java.util.*;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.htmlcleaner.conditional.ITagNodeCondition;
|
||||
import org.htmlcleaner.conditional.TagAllCondition;
|
||||
import org.htmlcleaner.conditional.TagNodeAttExistsCondition;
|
||||
import org.htmlcleaner.conditional.TagNodeAttValueCondition;
|
||||
import org.htmlcleaner.conditional.TagNodeNameCondition;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* XML node tag - basic node of the cleaned HTML tree. At the same time, it represents start tag token
|
||||
* after HTML parsing phase and before cleaning phase. After cleaning process, tree structure remains
|
||||
* containing tag nodes (TagNode class), content (text nodes - ContentNode), comments (CommentNode)
|
||||
* and optionally doctype node (DoctypeToken).
|
||||
* </p>
|
||||
*/
|
||||
public class TagNode extends TagToken implements HtmlNode {
|
||||
private final LinkedHashMap<String, String> attributes = new LinkedHashMap<String, String>();
|
||||
private final List<BaseToken> children = new ArrayList<BaseToken>();
|
||||
private DoctypeToken docType;
|
||||
private List<BaseToken> itemsToMove;
|
||||
private Map<String, String> nsDeclarations;
|
||||
|
||||
private transient boolean isFormed;
|
||||
|
||||
/**
|
||||
* Used to indicate a start tag that was auto generated because {@link TagInfo#isContinueAfter(String)}(closedTag.getName()) returned true
|
||||
* For example,
|
||||
* <pre>
|
||||
* <b><i>foo</b>bar
|
||||
* </pre>
|
||||
* would result in a new <i> being created resulting in
|
||||
* <pre>
|
||||
* <b><i>foo</i></b><i>bar</i>
|
||||
* </pre>
|
||||
* The second opening <i> tag is marked as autogenerated. This allows the autogenerated tag to be removed if it is unneeded.
|
||||
*/
|
||||
private boolean autoGenerated;
|
||||
|
||||
/**
|
||||
* This flag is set if we are using namespace aware setting, and the tagnode belongs
|
||||
* to a non-HTML namespace.
|
||||
*/
|
||||
private boolean isForeignMarkup;
|
||||
|
||||
/**
|
||||
* This flag is set if foreignMarkup is set; if it is false it means that the tagnode tree has not been built and so
|
||||
* it isn't known whether this node is a HTML node or foreign markup such as SVG.
|
||||
*/
|
||||
private boolean foreignMarkupFlagSet = false;
|
||||
|
||||
/**
|
||||
* This flag is set if attribute values should be trimmed.
|
||||
*/
|
||||
private boolean isTrimAttributeValues = true;
|
||||
|
||||
/**
|
||||
* Indicates that the node was marked to be pruned out of the tree.
|
||||
*/
|
||||
private boolean pruned;
|
||||
|
||||
/**
|
||||
* Indicates that the node is a copy of another node.
|
||||
* @see #makeCopy()
|
||||
*/
|
||||
private final boolean isCopy;
|
||||
|
||||
/**
 * Creates a tag node with the given name that is not flagged as a copy.
 *
 * @param name tag name, forwarded to the two-argument constructor
 */
public TagNode(String name) {
    this(name, false);
}
|
||||
|
||||
/**
 * Creates a tag node and records whether it is a copy of another node.
 *
 * @param name   tag name, handed to the {@link TagToken} constructor
 * @param isCopy value reported later by {@link #isCopy()}
 */
private TagNode(String name, boolean isCopy) {
    super(name);
    this.isCopy = isCopy;
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.htmlcleaner.TagToken#getName()
|
||||
*/
|
||||
@Override
|
||||
public String getName() {
|
||||
//
|
||||
// If this is foreign markup (e.g. SVG) we return the
|
||||
// original name, otherwise we return it in lower case
|
||||
//
|
||||
if (this.isForeignMarkup){
|
||||
return name;
|
||||
} else {
|
||||
return name == null ? null: name.toLowerCase();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param attName
|
||||
* @return Value of the specified attribute, or null if it this tag doesn't contain it.
|
||||
*/
|
||||
public String getAttributeByName(String attName) {
|
||||
if (attName == null) return null;
|
||||
//
|
||||
// We have to do case-insensitive comparisons
|
||||
//
|
||||
return attName != null ? (String) getAttributesInLowerCase().get(attName.toLowerCase()) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the attributes of the tagnode.
|
||||
*
|
||||
* @return Map instance containing all attribute name/value pairs.
|
||||
*/
|
||||
public Map<String, String> getAttributes() {
|
||||
return new LinkedHashMap<String, String>(this.attributes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the attributes of the tagnode in lower case.
|
||||
*
|
||||
* @return Map instance containing all attribute name/value pairs, with attribute names transformed to lower case
|
||||
*/
|
||||
public Map<String, String> getAttributesInLowerCase(){
|
||||
return attributesToLowerCase();
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace the current set of attributes with a new set.
|
||||
* @param attributes
|
||||
*/
|
||||
public void setAttributes(Map<String, String> attributes) {
|
||||
|
||||
//
|
||||
// If we haven't yet built the tree, we don't know if this
|
||||
// element is "foreign markup". In this case we don't want
|
||||
// to overwrite attributes with the same version with a lower
|
||||
// cased name when its set by the transforms processor.
|
||||
//
|
||||
|
||||
//
|
||||
// We're calling this method after the tree has been built,
|
||||
// so its safe to just set the attributes
|
||||
//
|
||||
if (foreignMarkupFlagSet){
|
||||
replaceAttributes(attributes);
|
||||
} else {
|
||||
//
|
||||
// The foreign markup flag hasn't been set, so instead of just
|
||||
// replacing the contents of the attributes map, we iterate
|
||||
// over it and use the original case name from the existing
|
||||
// attributes map where it exists
|
||||
//
|
||||
|
||||
//
|
||||
// First create a map to hold the processed map contents
|
||||
//
|
||||
LinkedHashMap<String, String> processedAttributes = new LinkedHashMap<String, String>();
|
||||
|
||||
//
|
||||
// Iterate over the keys in the map provided by the transforms processor
|
||||
// and add them to the set of processed keys
|
||||
//
|
||||
for (Map.Entry<String, String> entry : attributes.entrySet()){
|
||||
String key = entry.getKey();
|
||||
if (Thread.currentThread().isInterrupted()) {
|
||||
// Interruption: if the attributes.keySet() is large this loop will take a lot of time
|
||||
handleInterruption();
|
||||
return;
|
||||
}
|
||||
String keyToSet = key; // the key to set
|
||||
String value = attributes.get(key); // the value to set
|
||||
//value = Utils.deserializeEntities(value, true);
|
||||
|
||||
//
|
||||
// Check to see if the key exists in the current attribute set
|
||||
// with different casing. If so, we keep the casing
|
||||
//
|
||||
if (!foreignMarkupFlagSet){
|
||||
for (String existingKey: this.attributes.keySet()){
|
||||
if (existingKey.equalsIgnoreCase(key)){
|
||||
keyToSet = existingKey;
|
||||
}
|
||||
}
|
||||
}
|
||||
//
|
||||
// If we have duplicates, keep the first value
|
||||
//
|
||||
if (!processedAttributes.containsKey(keyToSet)){
|
||||
processedAttributes.put(keyToSet, value);
|
||||
}
|
||||
|
||||
}
|
||||
replaceAttributes(processedAttributes);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Clears existing attributes and puts replacement attributes
|
||||
* @param attributes the attributes to set
|
||||
*/
|
||||
private void replaceAttributes(Map<String, String> attributes){
|
||||
|
||||
this.attributes.clear();
|
||||
this.attributes.putAll(attributes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks existence of specified attribute.
|
||||
*
|
||||
* @param attName
|
||||
* @return true if TagNode has attribute
|
||||
*/
|
||||
public boolean hasAttribute(String attName) {
|
||||
if (attName == null) return false;
|
||||
|
||||
//
|
||||
// We have to do case-insensitive comparisons
|
||||
//
|
||||
for (String key: attributes.keySet()){
|
||||
if (key.equalsIgnoreCase(attName)) return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds specified attribute to this tag or overrides existing one.
|
||||
*
|
||||
* @param attName
|
||||
* @param attValue
|
||||
*/
|
||||
@Override
|
||||
public void addAttribute(String attName, String attValue) {
|
||||
if (attName != null) {
|
||||
String trim = attName.trim();
|
||||
if (!isForeignMarkup && foreignMarkupFlagSet) trim = trim.toLowerCase();
|
||||
String value = attValue == null ? "" : attValue;
|
||||
if (isTrimAttributeValues) value = value.trim().replaceAll("\\p{Cntrl}", " ");
|
||||
if (trim.length() != 0) {
|
||||
//
|
||||
// If there is already an entry, keep the existing value rather than
|
||||
// overwrite it.
|
||||
//
|
||||
if (!attributes.containsKey(trim)){
|
||||
attributes.put(trim, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes specified attribute from this tag.
|
||||
*
|
||||
* @param attName
|
||||
*/
|
||||
public void removeAttribute(String attName) {
|
||||
if (attName != null && !"".equals(attName.trim())) {
|
||||
attributes.remove(attName.toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return List of child TagNode objects.
|
||||
* @deprecated use {@link TagNode#getChildTagList()}, will be refactored and possibly removed in
|
||||
* future versions. TODO This method should be refactored because is does not
|
||||
* properly match the commonly used Java's getter/setter strategy.
|
||||
*/
|
||||
@Deprecated
|
||||
public List<TagNode> getChildren() {
|
||||
return getChildTagList();
|
||||
}
|
||||
|
||||
public void setChildren(List<? extends BaseToken> children) {
|
||||
this.children.clear();
|
||||
this.children.addAll(children);
|
||||
}
|
||||
|
||||
public List<? extends BaseToken> getAllChildren() {
|
||||
return children;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return List of child TagNode objects.
|
||||
*/
|
||||
public List<TagNode> getChildTagList() {
|
||||
List<TagNode> childTagList = new ArrayList<TagNode>();
|
||||
for (Object item: children) {
|
||||
if (item instanceof TagNode) {
|
||||
childTagList.add((TagNode) item);
|
||||
}
|
||||
}
|
||||
|
||||
return childTagList;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Whether this node has child elements or not.
|
||||
*/
|
||||
public boolean hasChildren() {
|
||||
return !children.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return An array of child TagNode instances.
|
||||
*/
|
||||
public TagNode[] getChildTags() {
|
||||
List<TagNode> childTagList = getChildTagList();
|
||||
TagNode childrenArray[] = new TagNode[childTagList.size()];
|
||||
for (int i = 0; i < childTagList.size(); i++) {
|
||||
childrenArray[i] = (TagNode) childTagList.get(i);
|
||||
}
|
||||
|
||||
return childrenArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Text content of this node and it's subelements.
|
||||
*/
|
||||
public CharSequence getText() {
|
||||
StringBuilder text = new StringBuilder();
|
||||
for (Object item :children) {
|
||||
if (item instanceof ContentNode) {
|
||||
text.append(((ContentNode) item).getContent());
|
||||
} else if (item instanceof TagNode) {
|
||||
CharSequence subtext = ((TagNode) item).getText();
|
||||
text.append(subtext);
|
||||
}
|
||||
}
|
||||
|
||||
return text;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param child Child to find index of
|
||||
* @return Index of the specified child node inside this node's children, -1 if node is not the
|
||||
* child
|
||||
*/
|
||||
public int getChildIndex(HtmlNode child) {
|
||||
int index = 0;
|
||||
for (Object curr : children) {
|
||||
if (curr == child) {
|
||||
return index;
|
||||
}
|
||||
index++;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts specified node at specified position in array of children
|
||||
*
|
||||
* @param index
|
||||
* @param childToAdd
|
||||
*/
|
||||
public void insertChild(int index, HtmlNode childToAdd) {
|
||||
children.add(index, childToAdd);
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts specified node in the list of children before specified child
|
||||
*
|
||||
* @param node Child before which to insert new node
|
||||
* @param nodeToInsert Node to be inserted at specified position
|
||||
*/
|
||||
public void insertChildBefore(HtmlNode node, HtmlNode nodeToInsert) {
|
||||
int index = getChildIndex(node);
|
||||
if (index >= 0) {
|
||||
insertChild(index, nodeToInsert);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts specified node in the list of children after specified child
|
||||
*
|
||||
* @param node Child after which to insert new node
|
||||
* @param nodeToInsert Node to be inserted at specified position
|
||||
*/
|
||||
public void insertChildAfter(HtmlNode node, HtmlNode nodeToInsert) {
|
||||
int index = getChildIndex(node);
|
||||
if (index >= 0) {
|
||||
insertChild(index + 1, nodeToInsert);
|
||||
}
|
||||
}
|
||||
|
||||
public DoctypeToken getDocType() {
|
||||
return docType;
|
||||
}
|
||||
|
||||
public void setDocType(DoctypeToken docType) {
|
||||
this.docType = docType;
|
||||
}
|
||||
|
||||
public void addChild(Object child) {
|
||||
if (child == null) {
|
||||
return;
|
||||
}
|
||||
if (child instanceof List) {
|
||||
addChildren((List) child);
|
||||
} else if (child instanceof ProxyTagNode) {
|
||||
children.add(((ProxyTagNode) child).getToken());
|
||||
} else if (child instanceof BaseToken){
|
||||
children.add((BaseToken)child);
|
||||
if (child instanceof TagNode) {
|
||||
TagNode childTagNode = (TagNode) child;
|
||||
childTagNode.parent = this;
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException("Attempted to add invalid child object to TagNode; class="+child.getClass());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add all elements from specified list to this node.
|
||||
*
|
||||
* @param newChildren
|
||||
*/
|
||||
public void addChildren(List newChildren) {
|
||||
if (newChildren != null) {
|
||||
for (Object child: newChildren) {
|
||||
addChild(child);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds first element in the tree that satisfy specified condition.
|
||||
*
|
||||
* @param condition
|
||||
* @param isRecursive
|
||||
* @return First TagNode found, or null if no such elements.
|
||||
*/
|
||||
private TagNode findElement(ITagNodeCondition condition, boolean isRecursive) {
|
||||
if (condition != null) {
|
||||
for (Object item : children) {
|
||||
if (item instanceof TagNode) {
|
||||
TagNode currNode = (TagNode) item;
|
||||
if (condition.satisfy(currNode)) {
|
||||
return currNode;
|
||||
} else if (isRecursive) {
|
||||
TagNode inner = currNode.findElement(condition, isRecursive);
|
||||
if (inner != null) {
|
||||
return inner;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all elements in the tree that satisfy specified condition.
|
||||
* @param condition
|
||||
* @param isRecursive
|
||||
* @return List of TagNode instances.
|
||||
*/
|
||||
private List<TagNode> findMatchingTagNodes(ITagNodeCondition condition, boolean isRecursive){
|
||||
List<TagNode> result = new LinkedList<TagNode>();
|
||||
if (condition == null) {
|
||||
return result;
|
||||
}
|
||||
|
||||
for (Object item : children) {
|
||||
if (item instanceof TagNode) {
|
||||
TagNode currNode = (TagNode) item;
|
||||
if (condition.satisfy(currNode)) {
|
||||
result.add(currNode);
|
||||
}
|
||||
if (isRecursive) {
|
||||
List<TagNode> innerList = currNode.findMatchingTagNodes(condition, isRecursive);
|
||||
if (innerList != null && innerList.size() > 0) {
|
||||
result.addAll(innerList);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all elements in the tree that satisfy specified condition.
|
||||
*
|
||||
* @param condition
|
||||
* @param isRecursive
|
||||
* @return List of TagNode instances with specified name.
|
||||
*/
|
||||
public List<? extends TagNode> getElementList(ITagNodeCondition condition, boolean isRecursive) {
|
||||
return findMatchingTagNodes(condition, isRecursive);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param condition
|
||||
* @param isRecursive
|
||||
* @return The array of all subelements that satisfy specified condition.
|
||||
*/
|
||||
private TagNode[] getElements(ITagNodeCondition condition, boolean isRecursive) {
|
||||
final List<TagNode> list = findMatchingTagNodes(condition, isRecursive);
|
||||
TagNode array[];
|
||||
if (list == null) {
|
||||
array = new TagNode[0];
|
||||
} else {
|
||||
array = (TagNode[]) list.toArray(new TagNode[list.size()]);
|
||||
}
|
||||
return array;
|
||||
}
|
||||
|
||||
public List<? extends TagNode> getAllElementsList(boolean isRecursive) {
|
||||
return getElementList(new TagAllCondition(), isRecursive);
|
||||
}
|
||||
|
||||
public TagNode[] getAllElements(boolean isRecursive) {
|
||||
return getElements(new TagAllCondition(), isRecursive);
|
||||
}
|
||||
|
||||
public TagNode findElementByName(String findName, boolean isRecursive) {
|
||||
return findElement(new TagNodeNameCondition(findName), isRecursive);
|
||||
}
|
||||
|
||||
public List<? extends TagNode> getElementListByName(String findName, boolean isRecursive) {
|
||||
return getElementList(new TagNodeNameCondition(findName), isRecursive);
|
||||
}
|
||||
|
||||
public TagNode[] getElementsByName(String findName, boolean isRecursive) {
|
||||
return getElements(new TagNodeNameCondition(findName), isRecursive);
|
||||
}
|
||||
|
||||
public TagNode findElementHavingAttribute(String attName, boolean isRecursive) {
|
||||
return findElement(new TagNodeAttExistsCondition(attName), isRecursive);
|
||||
}
|
||||
|
||||
public List<? extends TagNode> getElementListHavingAttribute(String attName, boolean isRecursive) {
|
||||
return getElementList(new TagNodeAttExistsCondition(attName), isRecursive);
|
||||
}
|
||||
|
||||
public TagNode[] getElementsHavingAttribute(String attName, boolean isRecursive) {
|
||||
return getElements(new TagNodeAttExistsCondition(attName), isRecursive);
|
||||
}
|
||||
|
||||
public TagNode findElementByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
|
||||
return findElement(new TagNodeAttValueCondition(attName, attValue, isCaseSensitive), isRecursive);
|
||||
}
|
||||
|
||||
public List<? extends TagNode> getElementListByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
|
||||
return getElementList(new TagNodeAttValueCondition(attName, attValue, isCaseSensitive), isRecursive);
|
||||
}
|
||||
|
||||
public TagNode[] getElementsByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
|
||||
return getElements(new TagNodeAttValueCondition(attName, attValue, isCaseSensitive), isRecursive);
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates XPath expression on give node. <br>
|
||||
* <em>
|
||||
* This is not fully supported XPath parser and evaluator.
|
||||
* Examples below show supported elements:
|
||||
* </em> <code>
|
||||
* <ul>
|
||||
* <li>//div//a</li>
|
||||
* <li>//div//a[@id][@class]</li>
|
||||
* <li>/body/*[1]/@type</li>
|
||||
* <li>//div[3]//a[@id][@href='r/n4']</li>
|
||||
* <li>//div[last() >= 4]//./div[position() = last()])[position() > 22]//li[2]//a</li>
|
||||
* <li>//div[2]/@*[2]</li>
|
||||
* <li>data(//div//a[@id][@class])</li>
|
||||
* <li>//p/last()</li>
|
||||
* <li>//body//div[3][@class]//span[12.2<position()]/@id</li>
|
||||
* <li>data(//a['v' < @id])</li>
|
||||
* </ul>
|
||||
* </code>
|
||||
*
|
||||
* @param xPathExpression
|
||||
* @return result of XPather evaluation.
|
||||
* @throws XPatherException
|
||||
*/
|
||||
public Object[] evaluateXPath(String xPathExpression) throws XPatherException {
|
||||
return new XPather(xPathExpression).evaluateAgainstNode(this);
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove this node from the tree.
|
||||
*
|
||||
* @return True if element is removed (if it is not root node).
|
||||
*/
|
||||
public boolean removeFromTree() {
|
||||
return parent != null ? parent.removeChild(this) : false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove specified child element from this node.
|
||||
*
|
||||
* @param child
|
||||
* @return True if child object existed in the children list.
|
||||
*/
|
||||
public boolean removeChild(Object child) {
|
||||
return this.children.remove(child);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes all children (subelements and text content).
|
||||
*/
|
||||
public void removeAllChildren() {
|
||||
this.children.clear();
|
||||
}
|
||||
|
||||
void addItemForMoving(Object item) {
|
||||
if (itemsToMove == null) {
|
||||
itemsToMove = new ArrayList<BaseToken>();
|
||||
}
|
||||
if (item instanceof BaseToken){
|
||||
itemsToMove.add((BaseToken)item);
|
||||
} else {
|
||||
throw new RuntimeException("Attempt to add invalid item for moving; class="+item.getClass());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
List<? extends BaseToken> getItemsToMove() {
|
||||
return itemsToMove;
|
||||
}
|
||||
|
||||
void setItemsToMove(List<BaseToken> itemsToMove) {
|
||||
this.itemsToMove = itemsToMove;
|
||||
}
|
||||
|
||||
boolean isFormed() {
|
||||
return isFormed;
|
||||
}
|
||||
|
||||
void setFormed(boolean isFormed) {
|
||||
this.isFormed = isFormed;
|
||||
}
|
||||
|
||||
void setFormed() {
|
||||
setFormed(true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param autoGenerated the autoGenerated to set
|
||||
*/
|
||||
public void setAutoGenerated(boolean autoGenerated) {
|
||||
this.autoGenerated = autoGenerated;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the autoGenerated
|
||||
*/
|
||||
public boolean isAutoGenerated() {
|
||||
return autoGenerated;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true, if node was marked to be pruned.
|
||||
*/
|
||||
public boolean isPruned() {
|
||||
return pruned;
|
||||
}
|
||||
|
||||
public void setPruned(boolean pruned) {
|
||||
this.pruned = pruned;
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
if (!isPruned()) {
|
||||
for (Object child : this.children) {
|
||||
if (child instanceof TagNode) {
|
||||
if (!((TagNode) child).isPruned()) {
|
||||
return false;
|
||||
}
|
||||
} else if (child instanceof ContentNode) {
|
||||
if (!((ContentNode) child).isBlank()) {
|
||||
return false;
|
||||
}
|
||||
} else if (child instanceof CommentNode) {
|
||||
// ideally could be discarded - however standard practice is to include browser specific commands in comments. :-(
|
||||
return false;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds namespace declaration to the node
|
||||
*
|
||||
* @param nsPrefix Namespace prefix
|
||||
* @param nsURI Namespace URI
|
||||
*/
|
||||
public void addNamespaceDeclaration(String nsPrefix, String nsURI) {
|
||||
if (nsDeclarations == null) {
|
||||
nsDeclarations = new TreeMap<String, String>();
|
||||
}
|
||||
nsDeclarations.put(nsPrefix, nsURI);
|
||||
}
|
||||
|
||||
/**
|
||||
* Collect all prefixes in namespace declarations up the path to the document root from the
|
||||
* specified node
|
||||
*
|
||||
* @param prefixes Set of prefixes to be collected
|
||||
*/
|
||||
void collectNamespacePrefixesOnPath(Set<String> prefixes) {
|
||||
Map<String, String> nsDeclarations = getNamespaceDeclarations();
|
||||
if (nsDeclarations != null) {
|
||||
for (String prefix : nsDeclarations.keySet()) {
|
||||
prefixes.add(prefix);
|
||||
}
|
||||
}
|
||||
if (parent != null) {
|
||||
parent.collectNamespacePrefixesOnPath(prefixes);
|
||||
}
|
||||
}
|
||||
|
||||
String getNamespaceURIOnPath(String nsPrefix) {
|
||||
if (nsDeclarations != null) {
|
||||
for (Map.Entry<String, String> nsEntry : nsDeclarations.entrySet()) {
|
||||
String currName = nsEntry.getKey();
|
||||
if (currName.equals(nsPrefix) || ("".equals(currName) && nsPrefix == null)) {
|
||||
return nsEntry.getValue();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (parent != null) {
|
||||
return parent.getNamespaceURIOnPath(nsPrefix);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Map of namespace declarations for this node
|
||||
*/
|
||||
public Map<String, String> getNamespaceDeclarations() {
|
||||
return nsDeclarations;
|
||||
}
|
||||
|
||||
public void serialize(Serializer serializer, Writer writer) throws IOException {
|
||||
serializer.serialize(this, writer);
|
||||
}
|
||||
|
||||
public TagNode makeCopy() {
|
||||
TagNode copy = new TagNode(name, true);
|
||||
copy.attributes.putAll(attributes);
|
||||
return copy;
|
||||
}
|
||||
|
||||
public boolean isCopy() {
|
||||
return isCopy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverses the tree and performs visitor's action on each node. It stops when it finishes all
|
||||
* the tree or when visitor returns false.
|
||||
*
|
||||
* @param visitor TagNodeVisitor implementation
|
||||
*/
|
||||
public void traverse(TagNodeVisitor visitor) {
|
||||
traverseInternally(visitor);
|
||||
}
|
||||
|
||||
private boolean traverseInternally(TagNodeVisitor visitor) {
|
||||
if (visitor != null) {
|
||||
boolean hasParent = parent != null;
|
||||
boolean toContinue = visitor.visit(parent, this);
|
||||
|
||||
if (!toContinue) {
|
||||
return false; // if visitor stops traversal
|
||||
} else if (hasParent && parent == null) {
|
||||
return true; // if this node is pruned from the tree during the visit, then don't go deeper
|
||||
}
|
||||
for (Object child : children.toArray()) { // make an array to avoid ConcurrentModificationException when some node is cut
|
||||
if (child instanceof TagNode) {
|
||||
toContinue = ((TagNode) child).traverseInternally(visitor);
|
||||
} else if (child instanceof ContentNode) {
|
||||
toContinue = visitor.visit(this, (ContentNode) child);
|
||||
} else if (child instanceof CommentNode) {
|
||||
toContinue = visitor.visit(this, (CommentNode) child);
|
||||
}
|
||||
if (!toContinue) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the isForeignMarkup
|
||||
*/
|
||||
public boolean isForeignMarkup() {
|
||||
return isForeignMarkup;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param isForeignMarkup the isForeignMarkup to set
|
||||
*/
|
||||
public void setForeignMarkup(boolean isForeignMarkup) {
|
||||
foreignMarkupFlagSet = true;
|
||||
this.isForeignMarkup = isForeignMarkup;
|
||||
|
||||
//
|
||||
// if set to false, change all existing attributes of this
|
||||
// element to lowercase.
|
||||
//
|
||||
if (!isForeignMarkup){
|
||||
this.replaceAttributes(getAttributesInLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the isTrimAttributeValues
|
||||
*/
|
||||
public boolean isTrimAttributeValues() {
|
||||
return isTrimAttributeValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param isTrimAttributeValues the isTrimAttributeValues to set
|
||||
*/
|
||||
public void setTrimAttributeValues(boolean isTrimAttributeValues) {
|
||||
this.isTrimAttributeValues = isTrimAttributeValues;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a copy of the set of attributes for this node with lowercase
|
||||
* names. Where there are duplicate attributes (e.g. class, CLASS) the first
|
||||
* value is retained.
|
||||
* @return a map of attributes in key/value pairs with names in lowercase
|
||||
*/
|
||||
private Map<String, String> attributesToLowerCase(){
|
||||
Map<String, String> lowerCaseAttributes = new LinkedHashMap<String, String>();
|
||||
for (Entry<String, String> entry: attributes.entrySet()){
|
||||
String key = entry.getKey();
|
||||
if (!lowerCaseAttributes.containsKey(key.toLowerCase())){
|
||||
lowerCaseAttributes.put(key.toLowerCase(), attributes.get(key));
|
||||
}
|
||||
}
|
||||
return lowerCaseAttributes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Called whenver the thread is interrupted. Currently this is a
|
||||
* placeholder, but could hold cleanup methods and user interaction
|
||||
*/
|
||||
private void handleInterruption(){
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,16 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
|
||||
* Defines action to be performed on TagNodes
|
||||
*/
|
||||
public interface TagNodeVisitor {
|
||||
|
||||
/**
|
||||
* Action to be performed on single node in the tree
|
||||
* @param parentNode Parent of tagNode
|
||||
* @param htmlNode node visited
|
||||
* @return True if tree traversal should be continued, false if it has to stop.
|
||||
*/
|
||||
public boolean visit(TagNode parentNode, HtmlNode htmlNode);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
|
||||
* Contains information about a single open tag
|
||||
*/
|
||||
|
||||
class TagPos {
|
||||
|
||||
int position;
|
||||
String name;
|
||||
TagInfo info;
|
||||
|
||||
TagPos(int position, String name, TagInfo tagInfo, CleanTimeValues cleanTimeValues) {
|
||||
this.position = position;
|
||||
this.name = name;
|
||||
this.info = tagInfo;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,66 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
|
||||
/**
|
||||
* <p>HTML tag token - descendants are start (TagNode) and end token (EndTagToken).</p>
|
||||
*/
|
||||
public abstract class TagToken extends BaseHtmlNode {
|
||||
|
||||
protected String name;
|
||||
|
||||
public TagToken() {
|
||||
}
|
||||
|
||||
public TagToken(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return name;
|
||||
}
|
||||
|
||||
abstract void addAttribute(String attName, String attValue);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,231 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Describes how specified tag is transformed to another one, or is ignored during parsing
|
||||
*/
|
||||
public class TagTransformation {
|
||||
public static String VAR_START = "${";
|
||||
public static String VAR_END = "}";
|
||||
private String sourceTag;
|
||||
private String destTag;
|
||||
private boolean preserveSourceAttributes;
|
||||
private Map<String, String> attributeTransformations = new LinkedHashMap<String, String>();
|
||||
private List<AttributeTransformation> attributePatternTransformations = new ArrayList<AttributeTransformation>();
|
||||
public TagTransformation() {
|
||||
this.preserveSourceAttributes = true;
|
||||
}
|
||||
/**
|
||||
* Creates new tag transformation from source tag to target tag specifying whether
|
||||
* source tag attributes are preserved.
|
||||
* @param sourceTag Name of the tag to be transformed.
|
||||
* @param destTag Name of tag to which source tag is to be transformed.
|
||||
* @param preserveSourceAttributes Tells whether source tag attributes are preserved in transformation.
|
||||
*/
|
||||
public TagTransformation(String sourceTag, String destTag, boolean preserveSourceAttributes) {
|
||||
this.sourceTag = sourceTag.toLowerCase();
|
||||
if (destTag == null) {
|
||||
this.destTag = null;
|
||||
} else {
|
||||
this.destTag = Utils.isValidXmlIdentifier(destTag) ? destTag.toLowerCase() : sourceTag;
|
||||
}
|
||||
this.preserveSourceAttributes = preserveSourceAttributes;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates new tag transformation from source tag to target tag preserving
|
||||
* all source tag attributes.
|
||||
* @param sourceTag Name of the tag to be transformed.
|
||||
* @param destTag Name of tag to which source tag is to be transformed.
|
||||
*/
|
||||
public TagTransformation(String sourceTag, String destTag) {
|
||||
this(sourceTag, destTag, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates new tag transformation in which specified tag will be skipped (ignored)
|
||||
* during parsing process.
|
||||
* @param sourceTag
|
||||
*/
|
||||
public TagTransformation(String sourceTag) {
|
||||
this(sourceTag, null);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds new attribute transformation to this tag transformation. It tells how destination
|
||||
* attribute will look like. Small templating mechanism is used to describe attribute value:
|
||||
* all names between ${ and } inside the template are evaluated against source tag attributes.
|
||||
* That way one can make attribute values consist of mix of source tag attributes.
|
||||
*
|
||||
* @param targetAttName Name of the destination attribute
|
||||
* @param transformationDesc Template describing attribute value.
|
||||
*/
|
||||
public void addAttributeTransformation(String targetAttName, String transformationDesc) {
|
||||
attributeTransformations.put(targetAttName.toLowerCase(), transformationDesc);
|
||||
}
|
||||
public void addAttributePatternTransformation(Pattern attNamePattern, String transformationDesc) {
|
||||
attributePatternTransformations.add(new AttributeTransformationPatternImpl(attNamePattern, null, transformationDesc));
|
||||
}
|
||||
public void addAttributePatternTransformation(Pattern attNamePattern, Pattern attValuePattern, String transformationDesc) {
|
||||
addAttributePatternTransformation(new AttributeTransformationPatternImpl(attNamePattern, attValuePattern, transformationDesc));
|
||||
}
|
||||
/**
|
||||
* @param attributeTransformation
|
||||
*/
|
||||
public void addAttributePatternTransformation(AttributeTransformation attributeTransformation) {
|
||||
if (attributePatternTransformations == null) {
|
||||
attributePatternTransformations = new ArrayList<AttributeTransformation>();
|
||||
}
|
||||
attributePatternTransformations.add(attributeTransformation);
|
||||
}
|
||||
/**
|
||||
* Adds new attribute transformation in which destination attrbute will not exists
|
||||
* (simply removes it from list of attributes).
|
||||
* @param targetAttName
|
||||
*/
|
||||
public void addAttributeTransformation(String targetAttName) {
|
||||
addAttributeTransformation(targetAttName, null);
|
||||
}
|
||||
|
||||
boolean hasAttributeTransformations() {
|
||||
return attributeTransformations != null || attributePatternTransformations != null;
|
||||
}
|
||||
|
||||
String getSourceTag() {
|
||||
return sourceTag;
|
||||
}
|
||||
|
||||
String getDestTag() {
|
||||
return destTag;
|
||||
}
|
||||
|
||||
boolean isPreserveSourceAttributes() {
|
||||
return preserveSourceAttributes;
|
||||
}
|
||||
|
||||
Map<String, String> getAttributeTransformations() {
|
||||
return attributeTransformations;
|
||||
}
|
||||
/**
|
||||
* @param attributes
|
||||
*/
|
||||
public Map<String, String> applyTagTransformations(Map<String, String> attributes) {
|
||||
boolean isPreserveSourceAtts = isPreserveSourceAttributes();
|
||||
boolean hasAttTransforms = hasAttributeTransformations();
|
||||
if ( hasAttTransforms || !isPreserveSourceAtts) {
|
||||
Map<String, String> newAttributes = isPreserveSourceAtts ? new LinkedHashMap<String, String>(attributes) : new LinkedHashMap<String, String>();
|
||||
if (hasAttTransforms) {
|
||||
Map<String, String> map = getAttributeTransformations();
|
||||
Iterator<Map.Entry<String, String>> iterator = map.entrySet().iterator();
|
||||
while (iterator.hasNext()) {
|
||||
Map.Entry<String, String> entry = iterator.next();
|
||||
String attName = (String) entry.getKey();
|
||||
String template = (String) entry.getValue();
|
||||
if (template == null) {
|
||||
newAttributes.remove(attName);
|
||||
} else {
|
||||
String attValue = evaluateTemplate(template, attributes);
|
||||
newAttributes.put(attName, attValue);
|
||||
}
|
||||
}
|
||||
|
||||
for(AttributeTransformation attributeTransformation: this.attributePatternTransformations) {
|
||||
for(Map.Entry<String, String> entry1: attributes.entrySet()) {
|
||||
String attName = entry1.getKey();
|
||||
if (attributeTransformation.satisfy(attName, entry1.getValue())) {
|
||||
String template = attributeTransformation.getTemplate();
|
||||
if (template == null) {
|
||||
newAttributes.remove(attName);
|
||||
} else {
|
||||
String attValue = evaluateTemplate(template, attributes);
|
||||
newAttributes.put(attName, attValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return newAttributes;
|
||||
} else {
|
||||
return attributes;
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Evaluates string template for specified map of variables. Template string can contain
|
||||
* dynamic parts in the form of ${VARNAME}. Each such part is replaced with value of the
|
||||
* variable if such exists in the map, or with empty string otherwise.
|
||||
*
|
||||
* @param template Template string
|
||||
* @param variables Map of variables (can be null)
|
||||
* @return Evaluated string
|
||||
*/
|
||||
public String evaluateTemplate(String template, Map<String, String> variables) {
|
||||
if (template == null) {
|
||||
return template;
|
||||
}
|
||||
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
||||
int startIndex = template.indexOf(VAR_START);
|
||||
int endIndex = -1;
|
||||
|
||||
while (startIndex >= 0 && startIndex < template.length()) {
|
||||
result.append( template.substring(endIndex + 1, startIndex) );
|
||||
endIndex = template.indexOf(VAR_END, startIndex);
|
||||
|
||||
if (endIndex > startIndex) {
|
||||
String varName = template.substring(startIndex + VAR_START.length(), endIndex);
|
||||
Object resultObj = variables != null ? variables.get(varName.toLowerCase()) : "";
|
||||
result.append( resultObj == null ? "" : resultObj.toString() );
|
||||
}
|
||||
|
||||
startIndex = template.indexOf( VAR_START, Math.max(endIndex + VAR_END.length(), startIndex + 1) );
|
||||
}
|
||||
|
||||
result.append( template.substring(endIndex + 1) );
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
/* Copyright (c) 2006-2019, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.StringWriter;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import org.w3c.dom.Document;
|
||||
|
||||
/**
|
||||
* A traversal-based serializer for DOM; used to avoid recursion and stack overflow for large
|
||||
* HTML documents.
|
||||
*/
|
||||
public class TraversalDomSerializer {
|
||||
|
||||
private CleanerProperties props;
|
||||
|
||||
/**
|
||||
* Whether XML entities should be escaped or not.
|
||||
*/
|
||||
protected boolean escapeXml = true;
|
||||
protected boolean deserializeCdataEntities = false;
|
||||
protected boolean strictErrorChecking = true;
|
||||
|
||||
/**
|
||||
* @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
* @param escapeXml if true then escape XML entities
|
||||
* @param deserializeCdataEntities if true then deserialize entities in CData sections
|
||||
* @param strictErrorChecking if false then Document strict error checking is turned off
|
||||
*/
|
||||
public TraversalDomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking){
|
||||
this.props = props;
|
||||
this.escapeXml = escapeXml;
|
||||
this.deserializeCdataEntities = deserializeCdataEntities;
|
||||
this.strictErrorChecking = strictErrorChecking;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
* @param escapeXml if true then escape XML entities
|
||||
* @param deserializeCdataEntities if true then deserialize entities in CData sections
|
||||
*/
|
||||
public TraversalDomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities) {
|
||||
this.props = props;
|
||||
this.escapeXml = escapeXml;
|
||||
this.deserializeCdataEntities = deserializeCdataEntities;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
* @param escapeXml if true then escape XML entities
|
||||
*/
|
||||
public TraversalDomSerializer(CleanerProperties props, boolean escapeXml) {
|
||||
this.props = props;
|
||||
this.escapeXml = escapeXml;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
|
||||
*/
|
||||
public TraversalDomSerializer(CleanerProperties props) {
|
||||
this.props = props;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param rootNode the HTML Cleaner root node to serialize
|
||||
* @return the W3C Document object
|
||||
* @throws ParserConfigurationException if there's an error during serialization
|
||||
*/
|
||||
public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
|
||||
DomBuilder builder = new DomBuilder(props, escapeXml, deserializeCdataEntities, strictErrorChecking);
|
||||
XmlTraversor.traverse(builder, rootNode);
|
||||
return builder.getDocument();
|
||||
}
|
||||
|
||||
public static String toString(Document doc) throws TransformerException, ParserConfigurationException{
|
||||
DOMSource domSource = new DOMSource(doc);
|
||||
StringWriter writer = new StringWriter();
|
||||
StreamResult result = new StreamResult(writer);
|
||||
TransformerFactory tf = TransformerFactory.newInstance();
|
||||
Transformer transformer = tf.newTransformer();
|
||||
transformer.transform(domSource, result);
|
||||
return writer.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,907 @@
|
||||
/* Copyright (c) 2006-2019, the HtmlCleaner project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* <p>Common utilities.</p>
|
||||
*
|
||||
* Created by: Vladimir Nikic<br/>
|
||||
* Date: November, 2006.
|
||||
*/
|
||||
public class Utils {
|
||||
|
||||
static final String VALID_XML_IDENTIFIER_START_CHAR_REGEX = "^[:A-Z_a-z\\u00C0\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02ff\\u0370-\\u037d"
|
||||
+ "\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f\\u2c00-\\u2fef\\u3001-\\ud7ff"
|
||||
+ "\\uf900-\\ufdcf\\ufdf0-\\ufffd\\x{10000}-\\x{EFFFF}]";
|
||||
static final Pattern VALID_XML_IDENTIFIER_START_CHAR_PATTERN =
|
||||
compileUnicodePattern(VALID_XML_IDENTIFIER_START_CHAR_REGEX);
|
||||
|
||||
/*
|
||||
The relevant production from the spec is http://www.w3.org/TR/xml/#NT-Name
|
||||
Name ::== NameStartChar NameChar *
|
||||
NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
|
||||
NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
|
||||
*/
|
||||
static final String VALID_XML_IDENTIFIER_CHAR_REGEX =
|
||||
"^[:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02ff\\u0370-\\u037d"
|
||||
+ "\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f\\u2c00-\\u2fef\\u3001-\\ud7ff"
|
||||
+ "\\uf900-\\ufdcf\\ufdf0-\\ufffd\\x{10000}-\\x{EFFFF}]"
|
||||
+ "[:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6"
|
||||
+ "\\u00F8-\\u02ff\\u0370-\\u037d\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f"
|
||||
+ "\\u2c00-\\u2fef\\u3001-\\udfff\\uf900-\\ufdcf\\ufdf0-\\ufffd\\-\\.0-9"
|
||||
+ "\\u00b7\\u0300-\\u036f\\u203f-\\u2040]*\\Z";
|
||||
static final Pattern VALID_XML_IDENTIFIER_CHAR_PATTERN =
|
||||
compileUnicodePattern(VALID_XML_IDENTIFIER_CHAR_REGEX);
|
||||
|
||||
|
||||
/**
|
||||
* Removes the first newline and last newline (if present) of a string
|
||||
* @param str
|
||||
* @return
|
||||
*/
|
||||
static String bchomp(final String str){
|
||||
return chomp(lchomp(str));
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes the last newline (if present) of a string
|
||||
* @param str
|
||||
* @return
|
||||
*/
|
||||
static String chomp(final String str){
|
||||
if (str.length() ==0) {
|
||||
return str;
|
||||
}
|
||||
|
||||
if (str.length() == 1) {
|
||||
final char ch = str.charAt(0);
|
||||
if (ch == '\r' || ch == '\n') {
|
||||
return "";
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
int lastIdx = str.length() - 1;
|
||||
final char last = str.charAt(lastIdx);
|
||||
|
||||
if (last == '\n') {
|
||||
if (str.charAt(lastIdx - 1) == '\r') {
|
||||
lastIdx--;
|
||||
}
|
||||
} else if (last != '\r') {
|
||||
lastIdx++;
|
||||
}
|
||||
return str.substring(0, lastIdx);
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes the first newline (if present) of a string
|
||||
* @param str
|
||||
* @return
|
||||
*/
|
||||
static String lchomp(final String str){
|
||||
if (str == null) return null;
|
||||
if (str.length() == 0) {
|
||||
return str;
|
||||
}
|
||||
|
||||
if (str.length() == 1) {
|
||||
final char ch = str.charAt(0);
|
||||
if (ch == '\r' || ch == '\n') {
|
||||
return "";
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
int firstIndex = 0;
|
||||
|
||||
final char first = str.charAt(0);
|
||||
if (first == '\n'){
|
||||
firstIndex++;
|
||||
if (str.charAt(1) == '\r') {
|
||||
firstIndex++ ;
|
||||
}
|
||||
} else if (first != '\r') {
|
||||
firstIndex = 0;
|
||||
}
|
||||
return str.substring(firstIndex, str.length());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reads content from the specified URL with specified charset into string
|
||||
* @param url
|
||||
* @param charset
|
||||
* @throws IOException
|
||||
*/
|
||||
@Deprecated // Removing network I/O will make htmlcleaner better suited to a server environment which needs managed connections
|
||||
static CharSequence readUrl(URL url, String charset) throws IOException {
|
||||
StringBuilder buffer = new StringBuilder(1024);
|
||||
InputStream inputStream = url.openStream();
|
||||
try {
|
||||
InputStreamReader reader = new InputStreamReader(inputStream, charset);
|
||||
char[] charArray = new char[1024];
|
||||
|
||||
int charsRead = 0;
|
||||
do {
|
||||
charsRead = reader.read(charArray);
|
||||
if (charsRead >= 0) {
|
||||
buffer.append(charArray, 0, charsRead);
|
||||
}
|
||||
} while (charsRead > 0);
|
||||
} finally {
|
||||
inputStream.close();
|
||||
}
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if specified link is full URL.
|
||||
*
|
||||
* @param link
|
||||
* @return True, if full URl, false otherwise.
|
||||
*/
|
||||
public static boolean isFullUrl(String link) {
|
||||
if (link == null) {
|
||||
return false;
|
||||
}
|
||||
link = link.trim().toLowerCase();
|
||||
return link.startsWith("http://") || link.startsWith("https://") || link.startsWith("file://");
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates full URL for specified page URL and link
|
||||
* which could be full, absolute or relative like there can
|
||||
* be found in A or IMG tags. (Reinstated as per user request in bug 159)
|
||||
*/
|
||||
public static String fullUrl(String pageUrl, String link) {
|
||||
if (isFullUrl(link)) {
|
||||
return link;
|
||||
} else if (link != null && link.startsWith("?")) {
|
||||
int qindex = pageUrl.indexOf('?');
|
||||
int len = pageUrl.length();
|
||||
if (qindex < 0) {
|
||||
return pageUrl + link;
|
||||
} else if (qindex == len - 1) {
|
||||
return pageUrl.substring(0, len - 1) + link;
|
||||
} else {
|
||||
return pageUrl + "&" + link.substring(1);
|
||||
}
|
||||
}
|
||||
|
||||
boolean isLinkAbsolute = link.startsWith("/");
|
||||
|
||||
if (!isFullUrl(pageUrl)) {
|
||||
pageUrl = "http://" + pageUrl;
|
||||
}
|
||||
|
||||
int slashIndex = isLinkAbsolute ? pageUrl.indexOf("/", 8) : pageUrl.lastIndexOf("/");
|
||||
if (slashIndex <= 8) {
|
||||
pageUrl += "/";
|
||||
} else {
|
||||
pageUrl = pageUrl.substring(0, slashIndex + 1);
|
||||
}
|
||||
|
||||
return isLinkAbsolute ? pageUrl + link.substring(1) : pageUrl + link;
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes HTML string
|
||||
* @param s String to be escaped
|
||||
* @param props Cleaner properties affects escaping behaviour
|
||||
* @return the escaped string
|
||||
*/
|
||||
public static String escapeHtml(String s, CleanerProperties props) {
|
||||
boolean advanced = props.isAdvancedXmlEscape();
|
||||
boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars();
|
||||
boolean translateSpecialEntities = props.isTranslateSpecialEntities();
|
||||
boolean transResCharsToNCR = props.isTransResCharsToNCR();
|
||||
boolean transSpecialEntitiesToNCR = props.isTransSpecialEntitiesToNCR();
|
||||
return escapeXml(s, advanced, recognizeUnicodeChars, translateSpecialEntities, false, transResCharsToNCR, transSpecialEntitiesToNCR, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes XML string.
|
||||
* @param s String to be escaped
|
||||
* @param props Cleaner properties affects escaping behaviour
|
||||
* @param isDomCreation Tells if escaped content will be part of the DOM
|
||||
* @return the escaped string
|
||||
*/
|
||||
public static String escapeXml(String s, CleanerProperties props, boolean isDomCreation) {
|
||||
boolean advanced = props.isAdvancedXmlEscape();
|
||||
boolean recognizeUnicodeChars = props.isRecognizeUnicodeChars();
|
||||
boolean translateSpecialEntities = props.isTranslateSpecialEntities();
|
||||
boolean transResCharsToNCR = props.isTransResCharsToNCR();
|
||||
boolean transSpecialEntitiesToNCR = props.isTransSpecialEntitiesToNCR();
|
||||
return escapeXml(s, advanced, recognizeUnicodeChars, translateSpecialEntities, isDomCreation, transResCharsToNCR, transSpecialEntitiesToNCR, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* change notes:
|
||||
* 1) convert ascii characters encoded using &#xx; format to the ascii characters -- may be an attempt to slip in malicious html
|
||||
* 2) convert &#xxx; format characters to " style representation if available for the character.
|
||||
* 3) convert html special entities to xml &#xxx; when outputing in xml
|
||||
* @param s the string to escape
|
||||
* @param advanced whether to use Advanced XML Escaping
|
||||
* @param recognizeUnicodeChars whether to recognise and replace Unicode characters
|
||||
* @param translateSpecialEntities whether to translate special entities
|
||||
* @param isDomCreation whether the escaping is in the context of DomCreation, an internal operation, with special rules.
|
||||
* @return the escaped string
|
||||
* TODO Consider moving to CleanerProperties since a long list of params is misleading.
|
||||
*/
|
||||
public static String escapeXml(String s, boolean advanced, boolean recognizeUnicodeChars, boolean translateSpecialEntities,
|
||||
boolean isDomCreation, boolean transResCharsToNCR, boolean translateSpecialEntitiesToNCR) {
|
||||
return escapeXml(s,advanced,recognizeUnicodeChars,translateSpecialEntities,isDomCreation,transResCharsToNCR,translateSpecialEntitiesToNCR,false);
|
||||
}
|
||||
|
||||
/**
|
||||
* change notes:
|
||||
* 1) convert ascii characters encoded using &#xx; format to the ascii characters -- may be an attempt to slip in malicious html
|
||||
* 2) convert &#xxx; format characters to " style representation if available for the character.
|
||||
* 3) convert html special entities to xml &#xxx; when outputing in xml
|
||||
* @param s the string to escape
|
||||
* @param advanced whether to use Advanced XML Escaping
|
||||
* @param recognizeUnicodeChars whether to recognise and replace Unicode characters
|
||||
* @param translateSpecialEntities whether to translate special entities
|
||||
* @param isDomCreation whether the escaping is in the context of DomCreation, an internal operation, with special rules.
|
||||
* @param isHtmlOutput whether the output is intended to be treated as HTML
|
||||
* @return
|
||||
* TODO Consider moving to CleanerProperties since a long list of params is misleading.
|
||||
*/
|
||||
public static String escapeXml(String s, boolean advanced, boolean recognizeUnicodeChars, boolean translateSpecialEntities,
|
||||
boolean isDomCreation, boolean transResCharsToNCR, boolean translateSpecialEntitiesToNCR, boolean isHtmlOutput) {
|
||||
if (s != null) {
|
||||
int len = s.length();
|
||||
StringBuilder result = new StringBuilder(len);
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
char ch = s.charAt(i);
|
||||
|
||||
SpecialEntity code;
|
||||
if (ch == '&') {
|
||||
if ( (advanced || recognizeUnicodeChars) && (i < len-1) && (s.charAt(i+1) == '#') ) {
|
||||
|
||||
i = convertToUnicode(s, isDomCreation, recognizeUnicodeChars, translateSpecialEntitiesToNCR, result, i+2);
|
||||
} else if ((translateSpecialEntities || advanced) &&
|
||||
(code = SpecialEntities.INSTANCE.getSpecialEntity(s.substring(i, i+Math.min(10, len-i)))) != null) {
|
||||
if (translateSpecialEntities && code.isHtmlSpecialEntity()) {
|
||||
if (recognizeUnicodeChars) {
|
||||
result.append( (char)code.intValue() );
|
||||
} else {
|
||||
result.append( code.getDecimalNCR() );
|
||||
}
|
||||
i += code.getKey().length() + 1;
|
||||
} else if (advanced ) {
|
||||
//
|
||||
// If we are creating a HTML DOM or outputting to the HtmlSerializer, use HTML special entities;
|
||||
// otherwise we get their XML escaped version (see bug #118).
|
||||
//
|
||||
result.append(transResCharsToNCR ? code.getDecimalNCR() : code.getEscaped(isHtmlOutput || isDomCreation));
|
||||
i += code.getKey().length()+1;
|
||||
} else {
|
||||
result.append(transResCharsToNCR ? getAmpNcr() : "&");
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// If the serializer used to output is HTML rather than XML, and we have a match to a
|
||||
// known HTML entity such as , we output it as-is (see bug #118)
|
||||
//
|
||||
|
||||
else if (isHtmlOutput)
|
||||
{
|
||||
// we have an ampersand and that's all we know so far
|
||||
|
||||
code = SpecialEntities.INSTANCE.getSpecialEntity(s.substring(i, i+Math.min(10, len-i)));
|
||||
|
||||
if ( code != null )
|
||||
{
|
||||
// It is a special entity like - leave it in place.
|
||||
|
||||
result.append(code.getEscapedValue());
|
||||
|
||||
// advance i by the length of the entity so we won't process each following character
|
||||
// key length excludes & and ; and we add 1 to skip the ;
|
||||
i += code.getKey().length()+1;
|
||||
}
|
||||
else if ( (i < len-1) && (s.charAt(i+1) == '#') )
|
||||
{
|
||||
// if the next char is a # then convert entity number to entity name (if possible)
|
||||
|
||||
i = convert_To_Entity_Name(s, false, false, false, result, i+2);
|
||||
|
||||
// assuming 'i' is being incremented correctly... not verified.
|
||||
}
|
||||
else
|
||||
{
|
||||
// html output but not an entity name or number
|
||||
|
||||
result.append(transResCharsToNCR ? getAmpNcr() : "&");
|
||||
}
|
||||
} else {
|
||||
result.append(transResCharsToNCR ? getAmpNcr() : "&");
|
||||
}
|
||||
} else if ((code = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(ch)) != null ) {
|
||||
|
||||
// It's a special entity character itself
|
||||
|
||||
if ( isHtmlOutput )
|
||||
{
|
||||
if ( "apos".equals(code.getKey()) )
|
||||
{
|
||||
// leave the apostrophes alone for html output
|
||||
// this is a cheap hack to avoid removing apostrophe from the special entities list for html output
|
||||
result.append(ch);
|
||||
}
|
||||
else
|
||||
{
|
||||
// output as entity name, or as literal character if isDomCreation
|
||||
result.append(isDomCreation? code.getHtmlString() : code.getEscapedValue());
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// if we have one of the XML reserved characters, get escaped version, otherwise,
|
||||
// output the literal characters.
|
||||
if (isDomCreation && !isXmlReservedCharacter(String.valueOf(ch))){
|
||||
result.append(ch);
|
||||
} else {
|
||||
// output as entity number, or as literal character if isDomCreation
|
||||
result.append(transResCharsToNCR ? code.getDecimalNCR() : code.getEscaped(isDomCreation));
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
result.append(ch);
|
||||
}
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private static String ampNcr;
|
||||
|
||||
private static String getAmpNcr() {
|
||||
if (ampNcr == null) {
|
||||
ampNcr = SpecialEntities.INSTANCE.getSpecialEntityByUnicode('&').getDecimalNCR();
|
||||
}
|
||||
|
||||
return ampNcr;
|
||||
}
|
||||
|
||||
// Matches one printable ASCII character (POSIX class \p{Print}); compiled once and reused.
private static final Pattern ASCII_CHAR = Pattern.compile("\\p{Print}");
|
||||
|
||||
/**
|
||||
* @param s
|
||||
* @param domCreation
|
||||
* @param recognizeUnicodeChars
|
||||
* @param translateSpecialEntitiesToNCR
|
||||
* @param result
|
||||
* @param i
|
||||
* @return
|
||||
*/
|
||||
|
||||
// Converts Numeric Character References (NCRs) (Dec or Hex) to Character Entity References
|
||||
// ie. € to €
|
||||
// This is almost a copy of convertToUnicode
|
||||
// only called in the case of isHtmlOutput when we see &# in the input stream
|
||||
|
||||
private static int convert_To_Entity_Name(String s, boolean domCreation, boolean recognizeUnicodeChars, boolean translateSpecialEntitiesToNCR, StringBuilder result, int i) {
|
||||
StringBuilder unicode = new StringBuilder();
|
||||
int charIndex = extractCharCode(s, i, true, unicode);
|
||||
if (unicode.length() > 0) {
|
||||
try {
|
||||
boolean isHex = unicode.substring(0,1).equals("x");
|
||||
|
||||
//
|
||||
// Get the unicode character and code point
|
||||
//
|
||||
int codePoint = -1;
|
||||
char[] unicodeChar = null;
|
||||
if (isHex){
|
||||
codePoint = Integer.parseInt(unicode.substring(1), 16);
|
||||
unicodeChar = Character.toChars(codePoint);
|
||||
} else {
|
||||
codePoint = Integer.parseInt(unicode.toString());
|
||||
unicodeChar = Character.toChars(codePoint);
|
||||
}
|
||||
|
||||
SpecialEntity specialEntity = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(codePoint);
|
||||
if (unicodeChar.length == 1 && unicodeChar[0] == 0) {
|
||||
// null character �Peanut for example
|
||||
// just consume character &
|
||||
result.append("&");
|
||||
}
|
||||
else if ( specialEntity != null )
|
||||
{
|
||||
if ( specialEntity.isHtmlSpecialEntity() )
|
||||
{
|
||||
result.append( domCreation? specialEntity.getHtmlString() : specialEntity.getEscapedValue() );
|
||||
}
|
||||
else
|
||||
{
|
||||
result.append(domCreation? specialEntity.getHtmlString():
|
||||
(translateSpecialEntitiesToNCR? (isHex? specialEntity.getHexNCR(): specialEntity.getDecimalNCR()) :
|
||||
specialEntity.getHtmlString()));
|
||||
}
|
||||
} else if ( recognizeUnicodeChars ) {
|
||||
// output unicode characters as their actual byte code with the exception of characters that have special xml meaning.
|
||||
result.append( String.valueOf(unicodeChar));
|
||||
} else if ( ASCII_CHAR.matcher(new String(unicodeChar)).find()) {
|
||||
// ascii printable character. this fancy escaping might be an attempt to slip in dangerous characters (i.e. spelling out <script> )
|
||||
// by converting to printable characters we can more easily detect such attacks.
|
||||
result.append(String.valueOf(unicodeChar));
|
||||
} else {
|
||||
// unknown unicode value - output as-is
|
||||
result.append( "&#").append(unicode).append(";" );
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
// should never happen now
|
||||
result.append("&#").append(unicode).append(";" );
|
||||
}
|
||||
} else {
|
||||
result.append("&");
|
||||
}
|
||||
return charIndex;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @param s
|
||||
* @param domCreation
|
||||
* @param recognizeUnicodeChars
|
||||
* @param translateSpecialEntitiesToNCR
|
||||
* @param result
|
||||
* @param i
|
||||
* @return
|
||||
*/
|
||||
private static int convertToUnicode(String s, boolean domCreation, boolean recognizeUnicodeChars, boolean translateSpecialEntitiesToNCR, StringBuilder result, int i) {
|
||||
StringBuilder unicode = new StringBuilder();
|
||||
int charIndex = extractCharCode(s, i, true, unicode);
|
||||
if (unicode.length() > 0) {
|
||||
try {
|
||||
boolean isHex = unicode.substring(0,1).equals("x");
|
||||
|
||||
//
|
||||
// Get the unicode character and code point
|
||||
//
|
||||
int codePoint = -1;
|
||||
char[] unicodeChar = null;
|
||||
if (isHex){
|
||||
codePoint = Integer.parseInt(unicode.substring(1), 16);
|
||||
} else {
|
||||
codePoint = Integer.parseInt(unicode.toString());
|
||||
}
|
||||
|
||||
unicodeChar = Character.toChars(codePoint);
|
||||
|
||||
SpecialEntity specialEntity = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(codePoint);
|
||||
if (unicodeChar.length == 1 && unicodeChar[0] == 0) {
|
||||
// null character �Peanut for example
|
||||
// just consume character &
|
||||
result.append("&");
|
||||
} else if ( specialEntity != null &&
|
||||
// special characters that are always escaped.
|
||||
(!specialEntity.isHtmlSpecialEntity()
|
||||
// OR we are not outputting unicode characters as the characters ( they are staying escaped )
|
||||
|| !recognizeUnicodeChars)) {
|
||||
result.append(domCreation? specialEntity.getHtmlString():
|
||||
(translateSpecialEntitiesToNCR? (isHex? specialEntity.getHexNCR(): specialEntity.getDecimalNCR()) :
|
||||
specialEntity.getEscapedXmlString()));
|
||||
} else if ( recognizeUnicodeChars ) {
|
||||
// output unicode characters as their actual byte code with the exception of characters that have special xml meaning.
|
||||
result.append( String.valueOf(unicodeChar));
|
||||
} else if ( ASCII_CHAR.matcher(new String(unicodeChar)).find()) {
|
||||
// ascii printable character. this fancy escaping might be an attempt to slip in dangerous characters (i.e. spelling out <script> )
|
||||
// by converting to printable characters we can more easily detect such attacks.
|
||||
result.append(String.valueOf(unicodeChar));
|
||||
} else {
|
||||
result.append( "&#").append(unicode).append(";" );
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
// should never happen now
|
||||
result.append("&#").append(unicode).append(";" );
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
// code point is not a legal unicode character
|
||||
result.append("&#").append(unicode).append(";" );
|
||||
}
|
||||
} else {
|
||||
result.append("&");
|
||||
}
|
||||
return charIndex;
|
||||
}
|
||||
|
||||
// TODO have pattern consume leading 0's and discard.
|
||||
public static Pattern HEX_STRICT = Pattern.compile("^([x|X][\\p{XDigit}]+)(;?)");
|
||||
public static Pattern HEX_RELAXED = Pattern.compile("^0*([x|X][\\p{XDigit}]+)(;?)");
|
||||
public static Pattern DECIMAL = Pattern.compile("^([\\p{Digit}]+)(;?)");
|
||||
/**
|
||||
* <ul>
|
||||
* <li>(earlier code was failing on this) - ŠA; is converted by FF to 3 characters: Š + 'A' + ';'</li>
|
||||
* <li>�x138A; is converted by FF to 6? 7? characters: � 'x'+'1'+'3'+ '8' + 'A' + ';'
|
||||
* #0 is displayed kind of weird</li>
|
||||
* <li>ᎊ is a single character</li>
|
||||
* </ul>
|
||||
*
|
||||
* @param s
|
||||
* @param charIndex
|
||||
* @param relaxedUnicode '�x138;' is treated like 'ĸ'
|
||||
* @param unicode
|
||||
* @return the index to continue scanning the source string -1 so normal loop incrementing skips the ';'
|
||||
*/
|
||||
private static int extractCharCode(String s, int charIndex, boolean relaxedUnicode, StringBuilder unicode) {
|
||||
int len = s.length();
|
||||
CharSequence subSequence = s.subSequence(charIndex, Math.min(len,charIndex+15));
|
||||
Matcher matcher;
|
||||
if( relaxedUnicode ) {
|
||||
matcher = HEX_RELAXED.matcher(subSequence);
|
||||
} else {
|
||||
matcher = HEX_STRICT.matcher(subSequence);
|
||||
}
|
||||
// silly note: remember calling find() twice finds second match :-)
|
||||
if (matcher.find() || ((matcher = DECIMAL.matcher(subSequence)).find())) {
|
||||
// -1 so normal loop incrementing skips the ';'
|
||||
charIndex += matcher.end() -1;
|
||||
unicode.append(matcher.group(1));
|
||||
}
|
||||
return charIndex;
|
||||
}
|
||||
|
||||
public static String sanitizeXmlIdentifier(String attName){
|
||||
return sanitizeXmlIdentifier(attName, "hc-generated-","");
|
||||
}
|
||||
|
||||
public static String sanitizeXmlIdentifier(String attName, String prefix){
|
||||
return sanitizeXmlIdentifier(attName, prefix,"");
|
||||
}
|
||||
|
||||
public static String sanitizeHtmlAttributeName(String name){
|
||||
// Attribute names must consist of one or more characters other than controls,
|
||||
// U+0020 SPACE, U+0022 ("), U+0027 ('), U+003E (>), U+002F (/), U+003D (=), and noncharacters.
|
||||
String regex = "[\\u0000\\u0020\\u0022\\u0027\\u003E\\u002F\\u003d]";
|
||||
Pattern pattern = compileUnicodePattern(regex);
|
||||
final Matcher matcher = pattern.matcher(name);
|
||||
name = matcher.replaceAll("");
|
||||
return name;
|
||||
}
|
||||
|
||||
public static boolean isValidHtmlAttributeName(String name){
|
||||
String regex = "^[^\\u0000\\u0020\\u0022\\u0027\\u003E\\u002F\\u003d]+$";
|
||||
Pattern pattern = compileUnicodePattern(regex);
|
||||
final Matcher matcher = pattern.matcher(name);
|
||||
return matcher.find();
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempts to replace invalid attribute names with valid ones.
|
||||
* @param attName the attribute name to fix
|
||||
* @param prefix the prefix to use to indicate an attribute name has been altered
|
||||
* @return either the original attribute name if valid, or a generated identifier if not
|
||||
*/
|
||||
public static String sanitizeXmlIdentifier(String attName, String prefix, String replacementCharacter){
|
||||
if (Utils.isValidXmlIdentifier(attName)) return attName;
|
||||
|
||||
//
|
||||
// Prepend with "hc-generated-" or similar prefix. Useful for
|
||||
// identifiers that are valid apart from the start character, e.g "1a"
|
||||
//
|
||||
if (!Utils.isValidXmlIdentifierStartChar(attName.substring(0,1))){
|
||||
if (!prefix.isEmpty()){
|
||||
String generatedAttName = prefix + attName;
|
||||
if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;
|
||||
} else {
|
||||
//
|
||||
// If not, strip out first character
|
||||
//
|
||||
String generatedAttName = attName.substring(1);
|
||||
if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// otherwise, replace or strip out invalid characters
|
||||
//
|
||||
String generatedAttName = Utils.replaceInvalidXmlIdentifierCharacters(attName,"");
|
||||
if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;
|
||||
|
||||
//
|
||||
// If we still have something invalid - for example none of the characters in
|
||||
// it are valid - then return null
|
||||
//
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether specified string can be valid tag name or attribute name in xml.
|
||||
* @param s String to be checked
|
||||
* @return True if string is valid xml identifier, false otherwise
|
||||
*/
|
||||
public static boolean isValidXmlIdentifier(String s) {
|
||||
if (s == null) return false;
|
||||
Matcher matcher = VALID_XML_IDENTIFIER_CHAR_PATTERN.matcher(s);
|
||||
if (matcher.find()){
|
||||
s = null;
|
||||
matcher = null;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param o
|
||||
* @return True if specified string is null of contains only whitespace characters
|
||||
*/
|
||||
public static boolean isEmptyString(Object o) {
|
||||
if ( o == null ) {
|
||||
return true;
|
||||
}
|
||||
String s = o.toString();
|
||||
String text = escapeXml(s, true, false, false, false, false, false, false);
|
||||
// TODO: doesn't escapeXml handle this?
|
||||
String last = text.replace(SpecialEntities.NON_BREAKABLE_SPACE, ' ').trim();
|
||||
return last.length() == 0;
|
||||
}
|
||||
|
||||
public static String[] tokenize(String s, String delimiters) {
|
||||
if (s == null) {
|
||||
return new String[] {};
|
||||
}
|
||||
|
||||
StringTokenizer tokenizer = new StringTokenizer(s, delimiters);
|
||||
String result[] = new String[tokenizer.countTokens()];
|
||||
int index = 0;
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
result[index++] = tokenizer.nextToken();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static boolean isXmlReservedCharacter(String c){
|
||||
final String XML_CHARS="'\"<>&";
|
||||
return XML_CHARS.contains(c);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* @return For xml element name or attribute name returns prefix (part before :) or null if there is no prefix
|
||||
*/
|
||||
public static String getXmlNSPrefix(String name) {
|
||||
int colIndex = name.indexOf(':');
|
||||
if (colIndex > 0) {
|
||||
return name.substring(0, colIndex);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param name
|
||||
* @return For xml element name or attribute name returns name after prefix (part after :)
|
||||
*/
|
||||
public static String getXmlName(String name) {
|
||||
int colIndex = name.indexOf(':');
|
||||
if (colIndex > 0 && colIndex < name.length() - 1) {
|
||||
return name.substring(colIndex + 1);
|
||||
}
|
||||
|
||||
return name;
|
||||
}
|
||||
|
||||
static boolean isValidInt(String s, int radix) {
|
||||
try {
|
||||
Integer.parseInt(s, radix);
|
||||
return true;
|
||||
} catch (NumberFormatException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Trims specified string from left.
|
||||
* @param s
|
||||
*/
|
||||
public static String ltrim(String s) {
|
||||
if (s == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
int index = 0;
|
||||
int len = s.length();
|
||||
|
||||
while ( index < len && Character.isWhitespace(s.charAt(index)) ) {
|
||||
index++;
|
||||
}
|
||||
|
||||
return (index >= len) ? "" : s.substring(index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Trims specified string from right.
|
||||
* @param s
|
||||
*/
|
||||
public static String rtrim(String s) {
|
||||
if (s == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
int len = s.length();
|
||||
int index = len;
|
||||
|
||||
while ( index > 0 && Character.isWhitespace(s.charAt(index-1)) ) {
|
||||
index--;
|
||||
}
|
||||
|
||||
return (index <= 0) ? "" : s.substring(0, index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether specified object's string representation is empty string (containing of only whitespaces).
|
||||
* @param object Object whose string representation is checked
|
||||
* @return true, if empty string, false otherwise
|
||||
*/
|
||||
public static boolean isWhitespaceString(Object object) {
|
||||
if (object != null) {
|
||||
String s = object.toString();
|
||||
return s != null && "".equals(s.trim());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// Replaces entities with actual characters
|
||||
//
|
||||
public static String deserializeEntities(String str, boolean recognizeUnicodeChars) {
|
||||
StringBuffer buf = new StringBuffer(str);
|
||||
SpecialEntities entities = SpecialEntities.INSTANCE;
|
||||
int entityStart = -1;
|
||||
boolean numericEntity = false;
|
||||
boolean hexEntity = false;
|
||||
int maxEntityLength = entities.getMaxEntityLength();
|
||||
int i = 0;
|
||||
int length = buf.length();
|
||||
while (i < length) {
|
||||
if (buf.charAt(i) == '&') {
|
||||
entityStart = i;
|
||||
numericEntity = false;
|
||||
hexEntity = false;
|
||||
++i;
|
||||
} else if (entityStart != -1) {
|
||||
if (buf.charAt(i) == ';') {
|
||||
int entityValue = -1;
|
||||
if (numericEntity) {
|
||||
try {
|
||||
entityValue = Integer.parseInt(
|
||||
buf.substring(
|
||||
entityStart + (hexEntity ? 3 : 2),
|
||||
i
|
||||
),
|
||||
hexEntity ? 16 : 10
|
||||
);
|
||||
} catch (NumberFormatException e) {
|
||||
entityValue = -1;
|
||||
}
|
||||
|
||||
SpecialEntity entity = entities.getSpecialEntityByUnicode(entityValue);
|
||||
if(entity != null)
|
||||
entityValue = entity.intValue();
|
||||
else if(!recognizeUnicodeChars)
|
||||
entityValue = -1;
|
||||
} else {
|
||||
SpecialEntity entity = entities.getSpecialEntity(buf.substring(entityStart + 1, i));
|
||||
if(entity != null)
|
||||
entityValue = entity.intValue();
|
||||
}
|
||||
|
||||
if (entityValue >= 0) {
|
||||
char[] decodedEntity = Character.toChars(entityValue);
|
||||
buf.replace(entityStart, i + 1, new String(decodedEntity));
|
||||
length = buf.length();
|
||||
i = entityStart + decodedEntity.length;
|
||||
} else {
|
||||
++i;
|
||||
}
|
||||
entityStart = -1;
|
||||
} else {
|
||||
if (i == entityStart + 1 && buf.charAt(i) == '#') {
|
||||
numericEntity = true;
|
||||
} else if (i == entityStart + 2 && numericEntity && buf.charAt(i) == 'x') {
|
||||
hexEntity = true;
|
||||
} else if (i - entityStart > maxEntityLength) {
|
||||
entityStart = -1;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
} else {
|
||||
++i;
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the initial character of an identifier is valid for XML
|
||||
* @param identifier the identifier to check
|
||||
* @return true is the intial character is valid
|
||||
*/
|
||||
public static boolean isValidXmlIdentifierStartChar(String identifier){
|
||||
final Matcher matcher = VALID_XML_IDENTIFIER_START_CHAR_PATTERN.matcher(identifier);
|
||||
return matcher.find();
|
||||
}
|
||||
|
||||
/**
|
||||
* Strips out invalid characters from names used for XML Elements and replaces them with the specified
|
||||
* character.
|
||||
*
|
||||
* For example, "<p%>" becomes "<p_>"
|
||||
* @param name
|
||||
* @return valid XML name
|
||||
*/
|
||||
public static String replaceInvalidXmlIdentifierCharacters(String name, String replacement){
|
||||
final String regex_repl = ""
|
||||
+ "[^:A-Z_a-z\\u00C0\\u00D6\\u00D8-\\u00F6"
|
||||
+ "\\u00F8-\\u02ff\\u0370-\\u037d\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f"
|
||||
+ "\\u2c00-\\u2fef\\u3001-\\udfff\\uf900-\\ufdcf\\ufdf0-\\ufffd\\-\\.0-9"
|
||||
+ "\\u00b7\\u0300-\\u036f\\u203f-\\u2040]";
|
||||
final Pattern pattern = compileUnicodePattern(regex_repl);
|
||||
final Matcher matcher = pattern.matcher(name);
|
||||
name = matcher.replaceAll(replacement);
|
||||
|
||||
return name;
|
||||
}
|
||||
|
||||
|
||||
private static Pattern compileUnicodePattern(String pattern){
|
||||
try {
|
||||
return Pattern.compile(pattern, Pattern.UNICODE_CHARACTER_CLASS);
|
||||
} catch(IllegalArgumentException ex) {
|
||||
return Pattern.compile(pattern);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
@@ -0,0 +1,612 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
/**
|
||||
* <p>Utility for searching cleaned document tree with XPath expressions.</p>
|
||||
* Examples of supported axes:
|
||||
* <code>
|
||||
* <ul>
|
||||
* <li>//div//a</li>
|
||||
* <li>//div//a[@id][@class]</li>
|
||||
* <li>/body/*[1]/@type</li>
|
||||
* <li>//div[3]//a[@id][@href='r/n4']</li>
|
||||
* <li>//div[last() >= 4]//./div[position() = last()])[position() > 22]//li[2]//a</li>
|
||||
* <li>//div[2]/@*[2]</li>
|
||||
* <li>data(//div//a[@id][@class])</li>
|
||||
* <li>//p/last()</li>
|
||||
* <li>//body//div[3][@class]//span[12.2<position()]/@id</li>
|
||||
* <li>data(//a['v' < @id])</li>
|
||||
* </ul>
|
||||
* </code>
|
||||
*/
|
||||
public class XPather {
|
||||
|
||||
private static final int C0 = '0';
|
||||
private static final int C9 = '9';
|
||||
private static final int CD = '.';
|
||||
private static final int CP = '+';
|
||||
private static final int CM = '-';
|
||||
private static final int CS = ' ';
|
||||
|
||||
// array of basic tokens of which XPath expression is made
|
||||
private String tokenArray[];
|
||||
|
||||
/**
|
||||
* Constructor - creates XPather instance with specified XPath expression.
|
||||
* @param expression
|
||||
*/
|
||||
public XPather(String expression) {
|
||||
StringTokenizer tokenizer = new StringTokenizer(expression, "/()[]\"'=<>", true);
|
||||
int tokenCount = tokenizer.countTokens();
|
||||
tokenArray = new String[tokenCount];
|
||||
|
||||
int index = 0;
|
||||
|
||||
// this is not real XPath compiler, rather simple way to recognize basic XPaths expressions
|
||||
// and interpret them against some TagNode instance.
|
||||
while (tokenizer.hasMoreTokens()) {
|
||||
tokenArray[index++] = tokenizer.nextToken();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main public method for this class - a way to execute XPath expression against
|
||||
* specified TagNode instance.
|
||||
* @param node
|
||||
*/
|
||||
public Object[] evaluateAgainstNode(TagNode node) throws XPatherException {
|
||||
if (node == null) {
|
||||
throw new XPatherException("Cannot evaluate XPath expression against null value!");
|
||||
}
|
||||
|
||||
Collection collectionResult = evaluateAgainst(singleton(node), 0, tokenArray.length - 1, false, 1, 0, false, null);
|
||||
Object[] array = new Object[collectionResult.size()];
|
||||
|
||||
Iterator iterator = collectionResult.iterator();
|
||||
int index = 0;
|
||||
while (iterator.hasNext()) {
|
||||
array[index++] = iterator.next();
|
||||
}
|
||||
|
||||
return array;
|
||||
}
|
||||
|
||||
private void throwStandardException() throws XPatherException {
|
||||
throw new XPatherException();
|
||||
}
|
||||
|
||||
protected Collection evaluateAgainst(Collection object,
|
||||
int from,
|
||||
int to,
|
||||
boolean isRecursive,
|
||||
int position,
|
||||
int last,
|
||||
boolean isFilterContext,
|
||||
Collection filterSource) throws XPatherException {
|
||||
if (from >= 0 && to < tokenArray.length && from <= to) {
|
||||
if ("".equals(tokenArray[from].trim())) {
|
||||
return evaluateAgainst(object, from + 1, to, isRecursive, position, last, isFilterContext, filterSource);
|
||||
} else if (isToken("(", from)) {
|
||||
int closingBracket = findClosingIndex(from, to);
|
||||
if (closingBracket > 0) {
|
||||
Collection value = evaluateAgainst(object, from + 1, closingBracket - 1, false, position, last, isFilterContext, filterSource);
|
||||
return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext, filterSource);
|
||||
} else {
|
||||
throwStandardException();
|
||||
}
|
||||
} else if (isToken("[", from)) {
|
||||
int closingBracket = findClosingIndex(from, to);
|
||||
if (closingBracket > 0 && object != null) {
|
||||
Collection value = filterByCondition(object, from + 1, closingBracket - 1);
|
||||
return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext, filterSource);
|
||||
} else {
|
||||
throwStandardException();
|
||||
}
|
||||
} else if (isToken("\"", from) || isToken("'", from)) { // string constant
|
||||
int closingQuote = findClosingIndex(from, to);
|
||||
if (closingQuote > from) {
|
||||
Collection value = singleton( flatten(from + 1, closingQuote - 1) );
|
||||
return evaluateAgainst(value, closingQuote + 1, to, false, position, last, isFilterContext, filterSource);
|
||||
} else {
|
||||
throwStandardException();
|
||||
}
|
||||
} else if ( (isToken("=", from) || isToken("<", from) || isToken(">", from)) && isFilterContext ) { // operator inside filter
|
||||
boolean logicValue;
|
||||
if ( isToken("=", from + 1) && (isToken("<", from) || isToken(">", from)) ) {
|
||||
Collection secondObject = evaluateAgainst(filterSource, from + 2, to, false, position, last, isFilterContext, filterSource);
|
||||
logicValue = evaluateLogic(object, secondObject, tokenArray[from] + tokenArray[from + 1]);
|
||||
} else {
|
||||
Collection secondObject = evaluateAgainst(filterSource, from + 1, to, false, position, last, isFilterContext, filterSource);
|
||||
logicValue = evaluateLogic(object, secondObject, tokenArray[from]);
|
||||
}
|
||||
return singleton(new Boolean(logicValue));
|
||||
} else if (isToken("/", from)) { // children of the node
|
||||
boolean goRecursive = isToken("/", from + 1);
|
||||
if (goRecursive) {
|
||||
from++;
|
||||
}
|
||||
if ( from < to ) {
|
||||
int toIndex = findClosingIndex(from, to) - 1;
|
||||
if (toIndex <= from) {
|
||||
toIndex = to;
|
||||
}
|
||||
Collection value = evaluateAgainst(object, from + 1, toIndex, goRecursive, 1, last, isFilterContext, filterSource);
|
||||
return evaluateAgainst(value, toIndex + 1, to, false, 1, last, isFilterContext, filterSource);
|
||||
} else {
|
||||
throwStandardException();
|
||||
}
|
||||
} else if (isFunctionCall(from, to)) {
|
||||
int closingBracketIndex = findClosingIndex(from + 1, to);
|
||||
Collection funcValue = evaluateFunction(object, from, to, position, last, isFilterContext);
|
||||
return evaluateAgainst(funcValue, closingBracketIndex + 1, to, false, 1, last, isFilterContext, filterSource);
|
||||
} else if (isValidInteger(tokenArray[from])) {
|
||||
Collection value = singleton(Integer.valueOf(tokenArray[from]));
|
||||
return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource);
|
||||
} else if (isValidDouble(tokenArray[from])) {
|
||||
Collection value = singleton(Double.valueOf(tokenArray[from]));
|
||||
return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource);
|
||||
} else {
|
||||
return getElementsByName(object, from, to, isRecursive, isFilterContext);
|
||||
}
|
||||
} else {
|
||||
return object;
|
||||
}
|
||||
|
||||
throw new XPatherException();
|
||||
}
|
||||
|
||||
private String flatten(int from, int to) {
|
||||
if (from <= to) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
for (int i = from; i <= to; i++) {
|
||||
result.append(tokenArray[i]);
|
||||
}
|
||||
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
private static boolean isValidInteger(String value) {
|
||||
final int l = value.length();
|
||||
if(l > 0) {
|
||||
int i = 1, c = value.charAt(0);
|
||||
if(c == CP || c == CM || (c >= C0 && c <= C9)) {
|
||||
for (; i < l; i++) {
|
||||
c = value.charAt(i);
|
||||
if (c < C0 || c > C9)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean isValidDouble(String value) {
|
||||
final int l = value.length();
|
||||
if(l > 0) {
|
||||
int i = 1, c = value.charAt(0);
|
||||
if(c == CP || c == CM || c == CS || (c >= C0 && c <= C9)) {
|
||||
for (; i < l; i++) {
|
||||
c = value.charAt(i);
|
||||
if (c != CD && (c < C0 || c > C9))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if given string is valid identifier.
|
||||
* @param s
|
||||
*/
|
||||
private boolean isIdentifier(String s) {
|
||||
if (s == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
s = s.trim();
|
||||
if (s.length() > 0) {
|
||||
if ( !Character.isLetter(s.charAt(0)) ) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 1; i < s.length(); i++) {
|
||||
final char ch = s.charAt(i);
|
||||
if ( ch != '_' && ch != '-' && !Character.isLetterOrDigit(ch) ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if tokens in specified range represents valid function call.
|
||||
* @param from
|
||||
* @param to
|
||||
* @return True if it is valid function call, false otherwise.
|
||||
*/
|
||||
private boolean isFunctionCall(int from, int to) {
|
||||
if ( !isIdentifier(tokenArray[from]) && !isToken("(", from + 1) ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return findClosingIndex(from + 1, to) > from + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates specified function.
|
||||
* Currently, following XPath functions are supported: last, position, text, count, data
|
||||
* @param source
|
||||
* @param from
|
||||
* @param to
|
||||
* @param position
|
||||
* @param last
|
||||
* @return Collection as the result of evaluation.
|
||||
*/
|
||||
protected Collection evaluateFunction(Collection source,
|
||||
int from,
|
||||
int to,
|
||||
int position,
|
||||
int last,
|
||||
boolean isFilterContext) throws XPatherException {
|
||||
String name = tokenArray[from].trim();
|
||||
ArrayList result = new ArrayList();
|
||||
|
||||
final int size = source.size();
|
||||
Iterator iterator = source.iterator();
|
||||
int index = 0;
|
||||
while (iterator.hasNext()) {
|
||||
Object curr = iterator.next();
|
||||
index++;
|
||||
if ( "last".equals(name) ) {
|
||||
result.add( Integer.valueOf(isFilterContext ? last : size) );
|
||||
} else if ( "position".equals(name) ) {
|
||||
result.add( Integer.valueOf(isFilterContext ? position : index) );
|
||||
} else if ( "text".equals(name) ) {
|
||||
if (curr instanceof TagNode) {
|
||||
result.add( ((TagNode)curr).getText() );
|
||||
} else if (curr instanceof String) {
|
||||
result.add( curr.toString() );
|
||||
}
|
||||
} else if ( "count".equals(name) ) {
|
||||
Collection argumentEvaluated =
|
||||
evaluateAgainst(source, from + 2, to - 1, false, position, 0, isFilterContext, null);
|
||||
result.add( Integer.valueOf(argumentEvaluated.size()) );
|
||||
} else if ( "data".equals(name) ) {
|
||||
Collection argumentEvaluated = evaluateAgainst(source, from + 2, to - 1, false, position, 0, isFilterContext, null);
|
||||
Iterator it = argumentEvaluated.iterator();
|
||||
while (it.hasNext()) {
|
||||
Object elem = it.next();
|
||||
if (elem instanceof TagNode) {
|
||||
result.add( ((TagNode)elem).getText() );
|
||||
} else if (elem instanceof String) {
|
||||
result.add( elem.toString() );
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new XPatherException("Unknown function " + name + "!");
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Filter nodes satisfying the condition
|
||||
* @param source
|
||||
* @param from
|
||||
* @param to
|
||||
*/
|
||||
protected Collection filterByCondition(Collection source, int from, int to) throws XPatherException {
|
||||
ArrayList result = new ArrayList();
|
||||
Iterator iterator = source.iterator();
|
||||
int index = 0;
|
||||
int size = source.size();
|
||||
while (iterator.hasNext()) {
|
||||
Object curr = iterator.next();
|
||||
index++;
|
||||
|
||||
ArrayList logicValueList = new ArrayList(evaluateAgainst(singleton(curr), from, to, false, index, size, true, singleton(curr)));
|
||||
if (logicValueList.size() >= 1) {
|
||||
Object first = logicValueList.get(0);
|
||||
if (first instanceof Boolean) {
|
||||
if ( ((Boolean)first).booleanValue() ) {
|
||||
result.add(curr);
|
||||
}
|
||||
} else if (first instanceof Integer) {
|
||||
if ( ((Integer)first).intValue() == index ) {
|
||||
result.add(curr);
|
||||
}
|
||||
} else {
|
||||
result.add(curr);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private boolean isToken(String token, int index) {
|
||||
int len = tokenArray.length;
|
||||
return index >= 0 && index < len && tokenArray[index].trim().equals(token.trim());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param from
|
||||
* @param to
|
||||
* @return matching closing index in the token array for the current token, or -1 if there is
|
||||
* no closing token within expected bounds.
|
||||
*/
|
||||
private int findClosingIndex(int from, int to) {
|
||||
if (from < to) {
|
||||
String currToken = tokenArray[from];
|
||||
|
||||
if ("\"".equals(currToken)) {
|
||||
for (int i = from + 1; i <= to; i++) {
|
||||
if ("\"".equals(tokenArray[i])) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
} else if ("'".equals(currToken)) {
|
||||
for (int i = from + 1; i <= to; i++) {
|
||||
if ("'".equals(tokenArray[i])) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
} else if ( "(".equals(currToken) || "[".equals(currToken) || "/".equals(currToken) ) {
|
||||
boolean isQuoteClosed = true;
|
||||
boolean isAposClosed = true;
|
||||
int brackets = "(".equals(currToken) ? 1 : 0;
|
||||
int angleBrackets = "[".equals(currToken) ? 1 : 0;
|
||||
int slashes = "/".equals(currToken) ? 1 : 0;
|
||||
for (int i = from + 1; i <= to; i++) {
|
||||
if ( "\"".equals(tokenArray[i]) ) {
|
||||
isQuoteClosed = !isQuoteClosed;
|
||||
} else if ( "'".equals(tokenArray[i]) ) {
|
||||
isAposClosed = !isAposClosed;
|
||||
} else if ( "(".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
|
||||
brackets++;
|
||||
} else if ( ")".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
|
||||
brackets--;
|
||||
} else if ( "[".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
|
||||
angleBrackets++;
|
||||
} else if ( "]".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
|
||||
angleBrackets--;
|
||||
} else if ( "/".equals(tokenArray[i]) && isQuoteClosed && isAposClosed && brackets == 0 && angleBrackets == 0) {
|
||||
slashes--;
|
||||
}
|
||||
|
||||
if (isQuoteClosed && isAposClosed && brackets == 0 && angleBrackets == 0 && slashes == 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if token is attribute (starts with @)
|
||||
* @param token
|
||||
*/
|
||||
private boolean isAtt(String token) {
|
||||
return token != null && token.length() > 1 && token.startsWith("@");
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates one-element collection for the specified object.
|
||||
* @param element
|
||||
*/
|
||||
private Collection singleton(Object element) {
|
||||
ArrayList result = new ArrayList();
|
||||
result.add(element);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* For the given source collection and specified name, returns collection of subnodes
|
||||
* or attribute values.
|
||||
* @param source
|
||||
* @param from
|
||||
* @param to
|
||||
* @param isRecursive
|
||||
* @return Colection of TagNode instances or collection of String instances.
|
||||
*/
|
||||
private Collection getElementsByName(Collection source, int from, int to, boolean isRecursive, boolean isFilterContext) throws XPatherException {
|
||||
String name = tokenArray[from].trim();
|
||||
|
||||
if (isAtt(name)) {
|
||||
name = name.substring(1);
|
||||
Collection result = new ArrayList();
|
||||
Collection nodes;
|
||||
if (isRecursive) {
|
||||
nodes = new LinkedHashSet();
|
||||
Iterator iterator = source.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
Object next = iterator.next();
|
||||
if (next instanceof TagNode) {
|
||||
TagNode node = (TagNode) next;
|
||||
nodes.addAll( node.getAllElementsList(true) );
|
||||
}
|
||||
}
|
||||
} else {
|
||||
nodes = source;
|
||||
}
|
||||
|
||||
Iterator iterator = nodes.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
Object next = iterator.next();
|
||||
if (next instanceof TagNode) {
|
||||
TagNode node = (TagNode) next;
|
||||
if ("*".equals(name)) {
|
||||
result.addAll( evaluateAgainst(node.getAttributes().values(), from + 1, to, false, 1, 1, isFilterContext, null) );
|
||||
} else {
|
||||
String attValue = node.getAttributeByName(name);
|
||||
if (attValue != null) {
|
||||
result.addAll( evaluateAgainst(singleton(attValue), from + 1, to, false, 1, 1, isFilterContext, null) );
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throwStandardException();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
} else {
|
||||
Collection result = new LinkedHashSet();
|
||||
Iterator iterator = source.iterator();
|
||||
int index = 0;
|
||||
while (iterator.hasNext()) {
|
||||
final Object next = iterator.next();
|
||||
if (next instanceof TagNode) {
|
||||
TagNode node = (TagNode) next;
|
||||
index++;
|
||||
boolean isSelf = ".".equals(name);
|
||||
boolean isParent = "..".equals(name);
|
||||
boolean isAll = "*".equals(name);
|
||||
|
||||
Collection subnodes;
|
||||
if (isSelf) {
|
||||
subnodes = singleton(node);
|
||||
} else if (isParent) {
|
||||
TagNode parent = node.getParent();
|
||||
subnodes = parent != null ? singleton(parent) : new ArrayList();
|
||||
} else {
|
||||
subnodes = isAll ? node.getChildTagList() : node.getElementListByName(name, false);
|
||||
}
|
||||
|
||||
LinkedHashSet nodeSet = new LinkedHashSet(subnodes);
|
||||
Collection refinedSubnodes = evaluateAgainst(nodeSet, from + 1, to, false, index, nodeSet.size(), isFilterContext, null);
|
||||
|
||||
if (isRecursive) {
|
||||
List childTags = node.getChildTagList();
|
||||
if (isSelf || isParent || isAll) {
|
||||
result.addAll(refinedSubnodes);
|
||||
}
|
||||
Iterator childIterator = childTags.iterator();
|
||||
while (childIterator.hasNext()) {
|
||||
TagNode childTag = (TagNode) childIterator.next();
|
||||
Collection childrenByName = getElementsByName(singleton(childTag), from, to, isRecursive, isFilterContext);
|
||||
if ( !isSelf && !isParent && !isAll && refinedSubnodes.contains(childTag) ) {
|
||||
result.add(childTag);
|
||||
}
|
||||
result.addAll(childrenByName);
|
||||
}
|
||||
} else {
|
||||
result.addAll(refinedSubnodes);
|
||||
}
|
||||
} else {
|
||||
throwStandardException();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Evaluates logic operation on two collections.
|
||||
* @param first
|
||||
* @param second
|
||||
* @param logicOperator
|
||||
* @return Result of logic operation
|
||||
*/
|
||||
protected boolean evaluateLogic(Collection first, Collection second, String logicOperator) {
|
||||
if (first == null || first.size() == 0 || second == null || second.size() == 0) {
|
||||
return false;
|
||||
}
|
||||
Object elem1 = first.iterator().next();
|
||||
Object elem2 = second.iterator().next();
|
||||
if (elem1 instanceof Number && elem2 instanceof Number) {
|
||||
double d1 = ((Number)elem1).doubleValue();
|
||||
double d2 = ((Number)elem2).doubleValue();
|
||||
if ("=".equals(logicOperator)) {
|
||||
return d1 == d2;
|
||||
} else if ("<".equals(logicOperator)) {
|
||||
return d1 < d2;
|
||||
} else if (">".equals(logicOperator)) {
|
||||
return d1 > d2;
|
||||
} else if ("<=".equals(logicOperator)) {
|
||||
return d1 <= d2;
|
||||
} else if (">=".equals(logicOperator)) {
|
||||
return d1 >= d2;
|
||||
}
|
||||
} else {
|
||||
String s1 = toText(elem1);
|
||||
String s2 = toText(elem2);
|
||||
int result = s1.compareTo(s2);
|
||||
if ("=".equals(logicOperator)) {
|
||||
return result == 0;
|
||||
} else if ("<".equals(logicOperator)) {
|
||||
return result < 0;
|
||||
} else if (">".equals(logicOperator)) {
|
||||
return result > 0;
|
||||
} else if ("<=".equals(logicOperator)) {
|
||||
return result <= 0;
|
||||
} else if (">=".equals(logicOperator)) {
|
||||
return result >= 0;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private String toText(Object o) {
|
||||
if (o == null) {
|
||||
return "";
|
||||
} if (o instanceof TagNode) {
|
||||
return ((TagNode)o).getText().toString();
|
||||
} else {
|
||||
return o.toString();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
|
||||
/**
 * <p>Exception that can occur during XPather evaluation.</p>
 */
public class XPatherException extends Exception {

    /** Message used when the caller supplies neither a message nor a cause. */
    private static final String DEFAULT_MESSAGE = "Error in evaluating XPath expression!";

    /**
     * Creates the exception with the default message and no cause.
     */
    public XPatherException() {
        super(DEFAULT_MESSAGE);
    }

    /**
     * Creates the exception with the given message and no cause.
     *
     * @param message description of the failure
     */
    public XPatherException(String message) {
        super(message);
    }

    /**
     * Creates the exception wrapping the given cause.
     *
     * @param cause underlying failure
     */
    public XPatherException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates the exception with both a message and a cause.
     *
     * @param message description of the failure
     * @param cause underlying failure
     */
    public XPatherException(String message, Throwable cause) {
        super(message, cause);
    }

}
|
||||
@@ -0,0 +1,313 @@
|
||||
/* Copyright (c) 2006-2007, Vladimir Nikic
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* <p>Abstract XML serializer - contains common logic for descendants.</p>
|
||||
*/
|
||||
public abstract class XmlSerializer extends Serializer {
|
||||
|
||||
public static final String XMLNS_NAMESPACE = "xmlns";
|
||||
|
||||
protected XmlSerializer(CleanerProperties props) {
|
||||
super(props);
|
||||
}
|
||||
|
||||
private boolean creatingHtmlDom;
|
||||
|
||||
/**
|
||||
* @param creatingHtmlDom the creatingHtmlDom to set
|
||||
*/
|
||||
public void setCreatingHtmlDom(boolean creatingHtmlDom) {
|
||||
this.creatingHtmlDom = creatingHtmlDom;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the creatingHtmlDom
|
||||
*/
|
||||
public boolean isCreatingHtmlDom() {
|
||||
return creatingHtmlDom;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use writeToStream() instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public void writeXmlToStream(TagNode tagNode, OutputStream out, String charset) throws IOException {
|
||||
super.writeToStream(tagNode, out, charset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use writeToStream() instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public void writeXmlToStream(TagNode tagNode, OutputStream out) throws IOException {
|
||||
super.writeToStream(tagNode, out);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use writeToFile() instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public void writeXmlToFile(TagNode tagNode, String fileName, String charset) throws IOException {
|
||||
super.writeToFile(tagNode, fileName, charset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use writeToFile() instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public void writeXmlToFile(TagNode tagNode, String fileName) throws IOException {
|
||||
super.writeToFile(tagNode, fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use getAsString() instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public String getXmlAsString(TagNode tagNode, String charset) {
|
||||
return super.getAsString(tagNode, charset);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use getAsString() instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public String getXmlAsString(TagNode tagNode) {
|
||||
return super.getAsString(tagNode);
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Use write() instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public void writeXml(TagNode tagNode, Writer writer, String charset) throws IOException {
|
||||
super.write(tagNode, writer, charset);
|
||||
}
|
||||
|
||||
protected String escapeXml(String xmlContent) {
|
||||
return Utils.escapeXml(xmlContent, props, isCreatingHtmlDom());
|
||||
}
|
||||
|
||||
protected boolean dontEscape(TagNode tagNode) {
|
||||
return props.isUseCdataFor(tagNode.getName());
|
||||
}
|
||||
|
||||
protected boolean isMinimizedTagSyntax(TagNode tagNode) {
|
||||
final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
|
||||
return tagNode.isEmpty() && (tagInfo == null || tagInfo.isMinimizedTagPermitted()) &&
|
||||
( props.isUseEmptyElementTags() || (tagInfo != null && tagInfo.isEmptyTag()) );
|
||||
}
|
||||
protected void serializeOpenTag(TagNode tagNode, Writer writer) throws IOException {
|
||||
serializeOpenTag(tagNode, writer, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize a CDATA section. If the context is a script or style tag, and
|
||||
* using CDATA for script and style is set to true, then we just write the
|
||||
* actual content, as the whole section is wrapped in CDATA tokens.
|
||||
* Otherwise we escape the content as if it were regular text.
|
||||
*
|
||||
* @param item the CDATA instance
|
||||
* @param tagNode the TagNode within which the CDATA appears
|
||||
* @param writer the writer to output to
|
||||
* @throws IOException
|
||||
*/
|
||||
protected void serializeCData(CData item, TagNode tagNode, Writer writer) throws IOException{
|
||||
if (dontEscape(tagNode)){
|
||||
writer.write(item.getContentWithoutStartAndEndTokens());
|
||||
} else {
|
||||
writer.write(escapeXml(item.getContentWithStartAndEndTokens()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize a content token, escaping where necessary.
|
||||
* @param item the content token to serialize
|
||||
* @param tagNode the TagNode within which the content token appears
|
||||
* @param writer the writer to output to
|
||||
* @throws IOException
|
||||
*/
|
||||
protected void serializeContentToken(ContentNode item, TagNode tagNode, Writer writer) throws IOException {
|
||||
if (dontEscape(tagNode)){
|
||||
writer.write(item.getContent());
|
||||
}else {
|
||||
writer.write( escapeXml(item.getContent()) );
|
||||
}
|
||||
}
|
||||
|
||||
protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
|
||||
if ( !isForbiddenTag(tagNode)) {
|
||||
String tagName = tagNode.getName();
|
||||
|
||||
//
|
||||
// Ensure we use valid XML element names
|
||||
//
|
||||
tagName = Utils.sanitizeXmlIdentifier(tagName);
|
||||
|
||||
Map<String, String> tagAtttributes = tagNode.getAttributes();
|
||||
|
||||
// always have head and body in newline
|
||||
if (props.isAddNewlineToHeadAndBody() && isHeadOrBody(tagName)) {
|
||||
writer.write("\n");
|
||||
}
|
||||
|
||||
writer.write("<" + tagName);
|
||||
Iterator<Map.Entry<String, String>> it = tagAtttributes.entrySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Map.Entry<String, String> entry = (Map.Entry<String,String>) it.next();
|
||||
String attName = (String) entry.getKey();
|
||||
String attValue = (String) entry.getValue();
|
||||
serializeAttribute(tagNode, writer, attName, attValue);
|
||||
}
|
||||
|
||||
if ( isMinimizedTagSyntax(tagNode) ) {
|
||||
writer.write(" />");
|
||||
if (newLine) {
|
||||
writer.write("\n");
|
||||
}
|
||||
} else if (dontEscape(tagNode)) {
|
||||
// because we are not considering if the file is xhtml or html,
|
||||
// we need to put a javascript comment in front of the CDATA in case this is NOT xhtml
|
||||
writer.write(">");
|
||||
if (!tagNode.getText().toString().startsWith(CData.SAFE_BEGIN_CDATA)) {
|
||||
writer.write(CData.SAFE_BEGIN_CDATA);
|
||||
//
|
||||
// Insert a newline after the CDATA start marker if there isn't
|
||||
// already a newline character there
|
||||
//
|
||||
if (!tagNode.getText().toString().equals("")){
|
||||
char firstchar = tagNode.getText().toString().charAt(0);
|
||||
if (firstchar != '\n' && firstchar !='\r') writer.write("\n");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
writer.write(">");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param tagNode
|
||||
* @return true if the tag is forbidden
|
||||
*/
|
||||
protected boolean isForbiddenTag(TagNode tagNode) {
|
||||
// null tagName when rootNode is a dummy node.
|
||||
// this happens when omitting the html envelope elements ( <html>, <head>, <body> elements )
|
||||
String tagName = tagNode.getName();
|
||||
return tagName == null;
|
||||
}
|
||||
|
||||
protected boolean isHeadOrBody(String tagName) {
|
||||
return "head".equalsIgnoreCase(tagName) || "body".equalsIgnoreCase(tagName);
|
||||
}
|
||||
|
||||
/**
|
||||
* This allows overriding to eliminate forbidden attributes (for example javascript attributes onclick, onblur, etc. )
|
||||
* @param writer
|
||||
* @param attName
|
||||
* @param attValue
|
||||
* @throws IOException
|
||||
*/
|
||||
protected void serializeAttribute(TagNode tagNode, Writer writer, String attName, String attValue) throws IOException {
|
||||
//
|
||||
// For XML, we can't use the lax definition of attribute names used in HTML5, so
|
||||
// we have to replace any invalid ones with a generated attribute name, or skip
|
||||
// them entirely.
|
||||
//
|
||||
if (!props.isAllowInvalidAttributeNames()){
|
||||
attName = Utils.sanitizeXmlIdentifier(attName, props.getInvalidXmlAttributeNamePrefix());
|
||||
}
|
||||
|
||||
if (attName != null && (Utils.isValidXmlIdentifier(attName) || props.isAllowInvalidAttributeNames()) && !isForbiddenAttribute(tagNode, attName, attValue)) {
|
||||
writer.write(" " + attName + "=\"" + escapeXml(attValue) + "\"");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Override to add additional conditions.
|
||||
* @param tagNode
|
||||
* @param attName
|
||||
* @param value
|
||||
* @return true if the attribute should not be outputed.
|
||||
*/
|
||||
protected boolean isForbiddenAttribute(TagNode tagNode, String attName, String value) {
|
||||
return !props.isNamespacesAware() && (XMLNS_NAMESPACE.equals(attName) || attName.startsWith(XMLNS_NAMESPACE +":"));
|
||||
}
|
||||
|
||||
protected void serializeEndTag(TagNode tagNode, Writer writer) throws IOException {
|
||||
serializeEndTag(tagNode, writer, true);
|
||||
}
|
||||
|
||||
protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
|
||||
if ( !isForbiddenTag(tagNode)) {
|
||||
String tagName = tagNode.getName();
|
||||
//
|
||||
// Ensure we use valid XML element names
|
||||
//
|
||||
tagName = Utils.sanitizeXmlIdentifier(tagName);
|
||||
if (dontEscape(tagNode)) {
|
||||
// because we are not considering if the file is xhtml or html,
|
||||
// we need to put a javascript comment in front of the CDATA in case this is NOT xhtml
|
||||
|
||||
if (!tagNode.getText().toString().trim().endsWith(CData.SAFE_END_CDATA)) {
|
||||
//
|
||||
// Insert a newline character before the CDATA end marker if there isn't one
|
||||
// already at the end of the tag node content
|
||||
//
|
||||
if (tagNode.getText().toString().length() > 0){
|
||||
char lastchar = tagNode.getText().toString().charAt(tagNode.getText().toString().length()-1);
|
||||
if (lastchar != '\n' && lastchar !='\r') writer.write("\n");
|
||||
}
|
||||
// Write the CDATA end marker
|
||||
writer.write(CData.SAFE_END_CDATA);
|
||||
}
|
||||
}
|
||||
|
||||
writer.write( "</" + tagName + ">" );
|
||||
|
||||
if (newLine) {
|
||||
writer.write("\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Depth-first node traversor. Use to iterate through all nodes under and including the specified root node.
|
||||
* <p>
|
||||
* This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
|
||||
* </p>
|
||||
*/
|
||||
public class XmlTraversor {
|
||||
private XmlVisitor visitor;
|
||||
|
||||
/**
|
||||
* Start a depth-first traverse of the root and all of its descendants.
|
||||
* @param visitor Node visitor.
|
||||
* @param root the root node point to traverse.
|
||||
*/
|
||||
public static void traverse(XmlVisitor visitor, HtmlNode root) {
|
||||
HtmlNode node = root;
|
||||
int depth = 0;
|
||||
|
||||
while (node != null) {
|
||||
visitor.head(node, depth);
|
||||
if ( node instanceof TagNode && ((TagNode)node).hasChildren() ) {
|
||||
node = (HtmlNode)((TagNode)node).getAllChildren().get(0);
|
||||
depth++;
|
||||
} else {
|
||||
List<? extends BaseToken> siblings = node.getSiblings();
|
||||
Iterator<? extends BaseToken> it = siblings.iterator();
|
||||
while (it.hasNext() && it.next() == null && depth > 0) {
|
||||
visitor.tail(node, depth);
|
||||
node = node.getParent();
|
||||
depth--;
|
||||
}
|
||||
visitor.tail(node, depth);
|
||||
if (node == root)
|
||||
break;
|
||||
if (it.hasNext()){
|
||||
node = (HtmlNode)it.next();
|
||||
} else {
|
||||
node = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
/**
|
||||
* Node visitor interface. Provide an implementing class to {@link XmlTraversor} to iterate through nodes.
|
||||
* <p>
|
||||
* This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
|
||||
* seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
|
||||
* create a start tag for a node, and tail to create the end tag.
|
||||
* </p>
|
||||
*/
|
||||
public interface XmlVisitor {
|
||||
/**
|
||||
* Callback for when a node is first visited.
|
||||
*
|
||||
* @param node the node being visited.
|
||||
* @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
|
||||
* of that will have depth 1.
|
||||
*/
|
||||
void head(HtmlNode node, int depth);
|
||||
|
||||
/**
|
||||
* Callback for when a node is last visited, after all of its descendants have been visited.
|
||||
*
|
||||
* @param node the node being visited.
|
||||
* @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
|
||||
* of that will have depth 1.
|
||||
*/
|
||||
void tail(HtmlNode node, int depth);
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
package org.htmlcleaner.audit;
|
||||
|
||||
/**
 * Error codes (read: messages) with which the cleaner tells clients why a
 * modification was made and what it involved.
 *
 * @author Konstantin Burov (aectann@gmail.com)
 */
public enum ErrorType {

    /**
     * A tag whose existence is <i>critical</i> for the current tag is missing;
     * most likely the current tag was pruned. In contrast to
     * {@link #RequiredParentMissing}, this is reported when the cleaner removed
     * the tag rather than creating the missing parent. See
     * {@link org.htmlcleaner.TagInfo} for a fuller description of fatal and
     * required tags.
     * <p>
     * <b>Example:</b>
     * <ul>
     * <li>{@code <option>} tag without a parent {@code <select>}
     * <li>{@code <tr>} tag without a parent {@code <table>}
     * <li>...
     * </ul>
     */
    FatalTagMissing,

    /**
     * The tag is not on the list of allowed tags, so it was removed.
     */
    NotAllowedTag,

    /**
     * A missing parent tag was added for the current tag (e.g. {@code tbody}
     * for {@code tr}).
     */
    RequiredParentMissing,

    /**
     * No matching close token was found for the open tag, so the tag was
     * closed automatically.
     * <p>
     * <b>Example:</b> {@code <p>Some text..} leaves the {@code <p>} tag unclosed.
     */
    UnclosedTag,

    /**
     * A second instance of a unique tag was found; most likely it was removed.
     * <p>
     * <b>Example:</b>
     *
     * <pre>
     * {@code
     * <head>
     *   <title>Some text</title>
     *   <title>Some more text</title>
     * </head>
     * }
     * </pre>
     */
    UniqueTagDuplicated,

    /**
     * The tag is deprecated and the current cleaner mode does not allow
     * deprecated tags. The tag was removed.
     * <p>
     * <b>Example:</b>
     * <ul>
     * <li>{@code <u>}
     * <li>{@code <s>}
     * <li>{@code <strike>}
     * <li>....
     * </ul>
     */
    Deprecated,

    /**
     * The tag has a child that is not permitted inside it, so the tag is
     * closed automatically to avoid the inclusion.
     * <p>
     * <b>Example:</b> in {@code <p>Some text <table>...</table><p>} the
     * {@code <table>} may not be a child of {@code <p>}, so the {@code <p>} is
     * closed before the {@code <table>}.
     */
    UnpermittedChild,

    /**
     * The tag is unknown and the current cleaner mode does not allow unknown
     * tags. The tag was removed.
     * <p>
     * <b>Example:</b>
     * <ul>
     * <li>{@code <any>}
     * <li>{@code <tag>}
     * <li>....
     * </ul>
     */
    Unknown
}
|
||||
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
package org.htmlcleaner.audit;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.htmlcleaner.conditional.ITagNodeCondition;
|
||||
|
||||
/**
|
||||
* Implementors can be registered on {@link org.htmlcleaner.CleanerProperties} to receive notifications about
|
||||
* modifications made by html cleaner.
|
||||
*
|
||||
* @author Konstantin Burov (aectann@gmail.com)
|
||||
*
|
||||
*/
|
||||
public interface HtmlModificationListener {
|
||||
|
||||
/**
|
||||
* Fired when cleaner fixes some error in html syntax.
|
||||
*
|
||||
* @param certain - true if change made doesn't hurts end document.
|
||||
* @param tagNode - problematic node.
|
||||
* @param errorType
|
||||
*/
|
||||
void fireHtmlError(boolean certain, TagNode tagNode, ErrorType errorType);
|
||||
|
||||
/**
|
||||
* Fired when cleaner fixes ugly html -- when syntax was correct but task was implemented by weird code.
|
||||
* For example when deprecated tags are removed.
|
||||
*
|
||||
* @param certainty - true if change made doesn't hurts end document.
|
||||
* @param tagNode - problematic node.
|
||||
* @param errorType
|
||||
*/
|
||||
void fireUglyHtml(boolean certainty, TagNode tagNode, ErrorType errorType);
|
||||
|
||||
/**
|
||||
* Fired when cleaner modifies html due to {@link ITagNodeCondition} match.
|
||||
*
|
||||
* @param condition that was applied to make the modification
|
||||
* @param tagNode - problematic node.
|
||||
*/
|
||||
void fireConditionModification(ITagNodeCondition condition, TagNode tagNode);
|
||||
|
||||
/**
|
||||
* Fired when cleaner modifies html due to user specified rules.
|
||||
*
|
||||
* @param certainty - true if change made doesn't hurts end document.
|
||||
* @param tagNode - problematic node.
|
||||
* @param errorType
|
||||
*/
|
||||
void fireUserDefinedModification(boolean certainty, TagNode tagNode, ErrorType errorType);
|
||||
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
package org.htmlcleaner.audit;
|
||||
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
import org.htmlcleaner.conditional.ITagNodeCondition;
|
||||
|
||||
public class HtmlModificationListenerLogger implements HtmlModificationListener {
|
||||
|
||||
|
||||
private Logger log;
|
||||
|
||||
public HtmlModificationListenerLogger(Logger log) {
|
||||
this.log = log;
|
||||
}
|
||||
public void fireConditionModification(ITagNodeCondition condition, TagNode tagNode) {
|
||||
this.log.info("fireConditionModification:"+condition+" at "+tagNode);
|
||||
}
|
||||
|
||||
public void fireHtmlError(boolean safety, TagNode tagNode, ErrorType errorType) {
|
||||
this.log.info("fireHtmlError:"+errorType+"("+safety+") at "+tagNode);
|
||||
}
|
||||
|
||||
public void fireUglyHtml(boolean safety, TagNode tagNode, ErrorType errorType) {
|
||||
this.log.info("fireConditionModification:"+errorType+"("+safety+") at "+tagNode);
|
||||
}
|
||||
|
||||
public void fireUserDefinedModification(boolean safety, TagNode tagNode, ErrorType errorType) {
|
||||
this.log.info("fireConditionModification"+errorType+"("+safety+") at "+tagNode);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,10 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
/**
|
||||
* Used as base for different node checkers.
|
||||
*/
|
||||
public interface ITagNodeCondition {
|
||||
public boolean satisfy(TagNode tagNode);
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
/**
|
||||
* All nodes.
|
||||
*/
|
||||
public class TagAllCondition implements ITagNodeCondition {
|
||||
public boolean satisfy(TagNode tagNode) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
/**
|
||||
* Checks if node contains specified attribute.
|
||||
*/
|
||||
public class TagNodeAttExistsCondition implements ITagNodeCondition {
|
||||
private String attName;
|
||||
|
||||
public TagNodeAttExistsCondition(String attName) {
|
||||
this.attName = attName;
|
||||
}
|
||||
|
||||
public boolean satisfy(TagNode tagNode) {
|
||||
return tagNode == null ? false : tagNode.getAttributes().containsKey( attName.toLowerCase() );
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
/**
|
||||
* Checks if node has specified attribute with specified value.
|
||||
*/
|
||||
public class TagNodeAttNameValueRegexCondition implements ITagNodeCondition {
|
||||
private Pattern attNameRegex;
|
||||
private Pattern attValueRegex;
|
||||
|
||||
public TagNodeAttNameValueRegexCondition(Pattern attNameRegex, Pattern attValueRegex) {
|
||||
this.attNameRegex = attNameRegex;
|
||||
this.attValueRegex = attValueRegex;
|
||||
}
|
||||
|
||||
public boolean satisfy(TagNode tagNode) {
|
||||
if (tagNode != null ) {
|
||||
for(Map.Entry<String, String>entry: tagNode.getAttributes().entrySet()) {
|
||||
if ( (attNameRegex == null || attNameRegex.matcher(entry.getKey()).find()) && (attValueRegex == null || attValueRegex.matcher( entry.getValue() ).find())) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,28 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
/**
|
||||
* Checks if node has specified attribute with specified value.
|
||||
*/
|
||||
public class TagNodeAttValueCondition implements ITagNodeCondition {
|
||||
private String attName;
|
||||
private String attValue;
|
||||
private boolean isCaseSensitive;
|
||||
|
||||
public TagNodeAttValueCondition(String attName, String attValue, boolean isCaseSensitive) {
|
||||
this.attName = attName;
|
||||
this.attValue = attValue;
|
||||
this.isCaseSensitive = isCaseSensitive;
|
||||
}
|
||||
|
||||
public boolean satisfy(TagNode tagNode) {
|
||||
if (tagNode == null || attName == null || attValue == null) {
|
||||
return false;
|
||||
} else {
|
||||
return isCaseSensitive ?
|
||||
attValue.equals( tagNode.getAttributeByName(attName) ) :
|
||||
attValue.equalsIgnoreCase( tagNode.getAttributeByName(attName) );
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,25 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
/**
|
||||
* Remove empty autogenerated nodes. These nodes are created when an unclosed tag is immediately closed.
|
||||
* @author patmoore
|
||||
*
|
||||
*/
|
||||
public class TagNodeAutoGeneratedCondition implements ITagNodeCondition {
|
||||
|
||||
public static final TagNodeAutoGeneratedCondition INSTANCE = new TagNodeAutoGeneratedCondition();
|
||||
/**
|
||||
* @see org.htmlcleaner.conditional.ITagNodeCondition#satisfy(org.htmlcleaner.TagNode)
|
||||
*/
|
||||
public boolean satisfy(TagNode tagNode) {
|
||||
// auto-generated node that is not needed.
|
||||
return tagNode.isAutoGenerated() && tagNode.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "auto generated tagNode";
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.htmlcleaner.ContentNode;
|
||||
import org.htmlcleaner.ITagInfoProvider;
|
||||
import org.htmlcleaner.TagInfo;
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
import static org.htmlcleaner.Utils.isEmptyString;
|
||||
import static org.htmlcleaner.Display.*;
|
||||
|
||||
/**
|
||||
* Checks if node is an <b>inline</b> 0r block element and has empty contents or white/non-breakable spaces only. Nodes that have
|
||||
* non-empty id attribute are considered to be non-empty, since they can be used in javascript scenarios.
|
||||
*
|
||||
* Examples that should be pruned,
|
||||
* <pre>
|
||||
* <u> </u>
|
||||
* <table><tr><td></td</tr></table>
|
||||
* </pre>
|
||||
*
|
||||
* Examples of code that should NOT be pruned:
|
||||
*
|
||||
* <pre>
|
||||
* <p><img/></p> - no content but image tags do not have text content.
|
||||
* <table<tr><td/><td>hi</td></tr> - the first (empty) td is a placeholder so the second td is in the correct column
|
||||
* </pre>
|
||||
* @author Konstantin Burov
|
||||
*/
|
||||
public class TagNodeEmptyContentCondition implements ITagNodeCondition {
|
||||
|
||||
private static final String ID_ATTRIBUTE_NAME = "id";
|
||||
|
||||
/**
|
||||
* Removal of element from this set can affect layout too hard.
|
||||
*/
|
||||
private static final Set < String > unsafeBlockElements = new HashSet < String >();
|
||||
|
||||
static {
|
||||
// cannot just remove a td unless removing the entire row. td's are place holders
|
||||
unsafeBlockElements.add("td");
|
||||
unsafeBlockElements.add("th");
|
||||
}
|
||||
private ITagInfoProvider tagInfoProvider;
|
||||
|
||||
public TagNodeEmptyContentCondition(ITagInfoProvider provider) {
|
||||
this.tagInfoProvider = provider;
|
||||
}
|
||||
|
||||
public boolean satisfy(TagNode tagNode) {
|
||||
return satisfy(tagNode, false);
|
||||
}
|
||||
private boolean satisfy(TagNode tagNode, boolean override) {
|
||||
String name = tagNode.getName();
|
||||
TagInfo tagInfo = tagInfoProvider.getTagInfo(name);
|
||||
//Only _block_ elements can match.
|
||||
if (tagInfo != null && !hasIdAttributeSet(tagNode) && none != tagInfo.getDisplay() && !tagInfo.isEmptyTag() && (override || !unsafeBlockElements.contains(name))) {
|
||||
CharSequence contentString = tagNode.getText();
|
||||
if(isEmptyString(contentString)) {
|
||||
// even though there may be no text need to make sure all children are empty or can be pruned
|
||||
if (tagNode.isEmpty()) {
|
||||
return true;
|
||||
} else {
|
||||
for(Object child: tagNode.getAllChildren()) {
|
||||
// TODO : similar check as in tagNode.isEmpty() argues for a visitor pattern
|
||||
// but allow empty td, ths to be pruned.
|
||||
if ( child instanceof TagNode) {
|
||||
if (!satisfy((TagNode)child, true)) {
|
||||
return false;
|
||||
}
|
||||
} else if (child instanceof ContentNode ) {
|
||||
if ( !((ContentNode)child).isBlank()) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean hasIdAttributeSet(TagNode tagNode) {
|
||||
Map < String, String > attributes = tagNode.getAttributes();
|
||||
return !isEmptyString(attributes.get(ID_ATTRIBUTE_NAME));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
/**
|
||||
* Checks if node is an insignificant br tag -- is placed at the end or at the
|
||||
* start of a block.
|
||||
*
|
||||
* @author Konstantin Burov (aectann@gmail.com)
|
||||
*/
|
||||
public class TagNodeInsignificantBrCondition implements ITagNodeCondition {
|
||||
|
||||
private static final String BR_TAG = "br";
|
||||
|
||||
public TagNodeInsignificantBrCondition() {
|
||||
}
|
||||
|
||||
public boolean satisfy(TagNode tagNode) {
|
||||
if (!isBrNode(tagNode)) {
|
||||
return false;
|
||||
}
|
||||
TagNode parent = tagNode.getParent();
|
||||
List children = parent.getAllChildren();
|
||||
int brIndex = children.indexOf(tagNode);
|
||||
return checkSublist(0, brIndex, children) || checkSublist (brIndex, children.size(), children);
|
||||
}
|
||||
|
||||
private boolean isBrNode(TagNode tagNode) {
|
||||
return tagNode != null && BR_TAG.equals(tagNode.getName());
|
||||
}
|
||||
|
||||
private boolean checkSublist(int start, int end, List list) {
|
||||
List sublist = list.subList(start, end);
|
||||
for (Object object : sublist) {
|
||||
if(!(object instanceof TagNode)){
|
||||
return false;
|
||||
}
|
||||
TagNode node = (TagNode) object;
|
||||
if(!isBrNode(node)&&!node.isPruned()){
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
package org.htmlcleaner.conditional;
|
||||
|
||||
import org.htmlcleaner.TagNode;
|
||||
|
||||
/**
|
||||
* Checks if node has specified name.
|
||||
*/
|
||||
public class TagNodeNameCondition implements ITagNodeCondition {
|
||||
private String name;
|
||||
|
||||
public TagNodeNameCondition(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public boolean satisfy(TagNode tagNode) {
|
||||
return tagNode == null ? false : tagNode.getName().equalsIgnoreCase(this.name);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,220 @@
|
||||
/* Copyright (c) 2006-2013, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import org.jdom2.input.DOMBuilder;
|
||||
import org.jdom2.output.Format;
|
||||
import org.jdom2.output.XMLOutputter;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Before;
|
||||
import org.w3c.dom.Document;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
|
||||
/**
|
||||
* Abstract test class with utility methods
|
||||
*/
|
||||
public abstract class AbstractHtmlCleanerTest {
|
||||
|
||||
protected HtmlCleaner cleaner;
|
||||
protected Serializer serializer;
|
||||
|
||||
@Before
|
||||
public void setup(){
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(true);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(false);
|
||||
cleanerProperties.setAdvancedXmlEscape(true);
|
||||
cleanerProperties.setTranslateSpecialEntities(false);
|
||||
cleanerProperties.setOmitComments(false);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
|
||||
cleaner = new HtmlCleaner(cleanerProperties);
|
||||
serializer = new SimpleXmlSerializer(cleanerProperties);
|
||||
}
|
||||
|
||||
protected void assertCleaned(String initial, String expected) throws IOException {
|
||||
TagNode node = cleaner.clean(initial);
|
||||
StringWriter writer = new StringWriter();
|
||||
serializer.write(node, writer, "UTF-8");
|
||||
assertEquals(expected, writer.toString());
|
||||
}
|
||||
|
||||
protected void assertCleanedHtml(String initial, String expected) throws IOException {
|
||||
TagNode node = cleaner.clean(initial);
|
||||
StringWriter writer = new StringWriter();
|
||||
Serializer ser = new SimpleHtmlSerializer(cleaner.getProperties());
|
||||
ser.write(node, writer, "UTF-8");
|
||||
assertEquals(expected, writer.toString());
|
||||
}
|
||||
|
||||
protected void assertCleanedDom(String initial, String expected) throws Exception {
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(false);
|
||||
TagNode node = cleaner.clean(initial);
|
||||
StringWriter writer = new StringWriter();
|
||||
DomSerializer domSerializer = new DomSerializer(cleaner.getProperties());
|
||||
Document document = domSerializer.createDOM(node);
|
||||
TransformerFactory tf = TransformerFactory.newInstance();
|
||||
Transformer transformer = tf.newTransformer();
|
||||
transformer.transform(new DOMSource(document), new StreamResult(writer));
|
||||
String rawActual = writer.getBuffer().toString();
|
||||
|
||||
String[] lines = rawActual.split("\n");
|
||||
StringWriter buffer = new StringWriter();
|
||||
for (String line : lines) {
|
||||
buffer.write(line.trim());
|
||||
buffer.write("\n");
|
||||
}
|
||||
String actual = buffer.toString();
|
||||
actual = actual.substring(actual.indexOf("<body>\n")+7, actual.indexOf("</body>")).trim();
|
||||
assertEquals(expected, actual);
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(true);
|
||||
}
|
||||
|
||||
protected void assertCleanedJDom(String initial, String expected) throws Exception {
|
||||
boolean env = cleaner.getProperties().isOmitHtmlEnvelope();
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(false);
|
||||
TagNode node = cleaner.clean(initial);
|
||||
StringWriter writer = new StringWriter();
|
||||
JDomSerializer domSerializer = new JDomSerializer(cleaner.getProperties());
|
||||
org.jdom2.Document document = domSerializer.createJDom(node);
|
||||
XMLOutputter out = new XMLOutputter();
|
||||
out.output(document, writer);
|
||||
String actual = writer.getBuffer().toString();
|
||||
actual = actual.substring(actual.indexOf("<body>")+6, actual.indexOf("</body>"));
|
||||
assertEquals(expected, actual);
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(env);
|
||||
}
|
||||
|
||||
protected String readFile(String filename) throws IOException {
|
||||
File file = new File(filename);
|
||||
CharSequence content = Utils.readUrl(file.toURI().toURL(), "UTF-8");
|
||||
return content.toString();
|
||||
}
|
||||
|
||||
public static final String HEADER =
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
|
||||
//+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" "
|
||||
//+ "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n";
|
||||
private static final String HEADER_FULL = HEADER + "<html><head /><body>";
|
||||
private static final String FOOTER = "</body></html>";
|
||||
|
||||
protected void assertHTML(String expected, String input) throws IOException {
|
||||
StringWriter writer = new StringWriter();
|
||||
serializer.write(cleaner.clean(input), writer, "UTF-8");
|
||||
String actual = writer.toString();
|
||||
|
||||
Assert.assertEquals(HEADER_FULL + expected + FOOTER, actual);
|
||||
}
|
||||
|
||||
protected void assertHTMLWithHeader(String expected, String input) throws IOException {
|
||||
StringWriter writer = new StringWriter();
|
||||
serializer.write(cleaner.clean(input), writer, "UTF-8");
|
||||
String actual = writer.toString();
|
||||
|
||||
Assert.assertEquals(HEADER + expected, actual);
|
||||
}
|
||||
|
||||
protected void assertHTMLUsingDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties());
|
||||
|
||||
Document document = ser.createDOM(cleaner.clean(input));
|
||||
|
||||
DOMBuilder in = new DOMBuilder();
|
||||
org.jdom2.Document jdomDoc = in.build(document);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String actual = outputter.outputString(jdomDoc);
|
||||
|
||||
Assert.assertEquals(HEADER_FULL + expected + FOOTER + "\n", actual);
|
||||
}
|
||||
|
||||
protected void assertHTMLUsingJDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
|
||||
JDomSerializer ser = new JDomSerializer(cleaner.getProperties());
|
||||
|
||||
org.jdom2.Document document = ser.createJDom(cleaner.clean(input));
|
||||
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String actual = outputter.outputString(document);
|
||||
|
||||
Assert.assertEquals(HEADER_FULL + expected + FOOTER + "\n", actual);
|
||||
}
|
||||
|
||||
protected void assertHTMLIncludingHeaderUsingJDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
|
||||
JDomSerializer ser = new JDomSerializer(cleaner.getProperties());
|
||||
|
||||
org.jdom2.Document document = ser.createJDom(cleaner.clean(input));
|
||||
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String actual = outputter.outputString(document);
|
||||
|
||||
Assert.assertEquals(HEADER + expected + "\n", actual);
|
||||
}
|
||||
|
||||
protected String documentToString(
|
||||
final Document doc)
|
||||
{
|
||||
String ret = "";
|
||||
final TransformerFactory tf = TransformerFactory.newInstance();
|
||||
try
|
||||
{
|
||||
final Transformer transformer = tf.newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
|
||||
transformer.setOutputProperty(OutputKeys.METHOD, "xml");
|
||||
transformer.setOutputProperty(OutputKeys.INDENT, "yes");
|
||||
transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
|
||||
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
|
||||
final StringWriter stringWriter = new StringWriter();
|
||||
transformer.transform(new DOMSource(doc), new StreamResult(stringWriter));
|
||||
ret = stringWriter.getBuffer().toString();
|
||||
}
|
||||
catch (TransformerException e)
|
||||
{
|
||||
System.err.println("Failed to toString document " + e);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,37 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* @author patmoore
|
||||
*
|
||||
*/
|
||||
public class BadTerminationTest extends TestCase {
|
||||
|
||||
public void testHandleGarbageInEndTag() throws Exception {
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitHtmlEnvelope(true);
|
||||
cleanerProperties.setOmitXmlDeclaration(true);
|
||||
cleanerProperties.setUseEmptyElementTags(false);
|
||||
|
||||
String output = new SimpleXmlSerializer(cleanerProperties).getAsString( "<div></div id=\"foo\">");
|
||||
assertEquals("<div></div>", output);
|
||||
}
|
||||
|
||||
// public void testWhiteSpaceInTag() throws Exception {
|
||||
// String s =
|
||||
// "<html><body><table width=\"838\" cellpadding=\"5\" cellspacing=\"0\">\n"
|
||||
// +
|
||||
// " <tbody>\n" +
|
||||
// " <td width=\"704\"> </td>\n" +
|
||||
// " </tr\n" +
|
||||
// " ></tbody>< /table></bo dy>";
|
||||
// CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
// cleanerProperties.setOmitHtmlEnvelope(false);
|
||||
// cleanerProperties.setOmitXmlDeclaration(true);
|
||||
// cleanerProperties.setUseEmptyElementTags(false);
|
||||
// String output = new
|
||||
// SimpleXmlSerializer().getXmlAsString(cleanerProperties, s, "UTF-8");
|
||||
// assertEquals("<html><head></head><body><table width=\"838\" cellpadding=\"5\" cellspacing=\"0\"><tbody><tr><td width=\"704\"> </td></tr></tbody></table></body></html>",output);
|
||||
// }
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import junit.framework.*;
|
||||
|
||||
/**
|
||||
* Test cases for {@link BrowserCompactXmlSerializer}
|
||||
*
|
||||
* @author Konstantin Burov (aectann@gmail.com)
|
||||
*
|
||||
*/
|
||||
public class BrowserCompactXmlSerializerTest extends TestCase {
|
||||
|
||||
private BrowserCompactXmlSerializer compactXmlSerializer;
|
||||
private CleanerProperties properties;
|
||||
|
||||
@Override
|
||||
protected void setUp() throws Exception {
|
||||
properties = new CleanerProperties();
|
||||
properties.setOmitHtmlEnvelope(true);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
compactXmlSerializer = new BrowserCompactXmlSerializer(properties);
|
||||
}
|
||||
|
||||
|
||||
public void testInlineWhitespaceHandling(){
|
||||
String cleaned = compactXmlSerializer.getAsString("<p>Test1 <a href=\"somelink\">Linktext</a> Test2</p>");
|
||||
assertEquals("<p>Test1 <a href=\"somelink\">Linktext</a> Test2</p>\n", cleaned);
|
||||
|
||||
cleaned = compactXmlSerializer.getAsString("<p>Test1<a href=\"somelink\">Linktext</a>Test2</p>");
|
||||
assertEquals("<p>Test1<a href=\"somelink\">Linktext</a>Test2</p>\n", cleaned);
|
||||
|
||||
cleaned = compactXmlSerializer.getAsString("one<br><b>two</b></br>three<b>four</b>");
|
||||
assertEquals("one<br /><b>two</b>three<b>four</b>", cleaned);
|
||||
|
||||
cleaned = compactXmlSerializer.getAsString("one<br><b>two</b></br>three <b>four</b>");
|
||||
assertEquals("one<br /><b>two</b>three <b>four</b>", cleaned);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that serializer removes white spaces properly.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testRemoveInsignificantWhitespaces() throws IOException{
|
||||
String cleaned = compactXmlSerializer.getAsString( " <u>text here, </u><b>some text</b> ");
|
||||
assertEquals("<u>text here, </u><b>some text</b>", cleaned);
|
||||
cleaned = compactXmlSerializer.getAsString( " <div class=\"foo\">2 roots < here > </div>");
|
||||
assertEquals("<div class=\"foo\">2 roots < here ></div>\n", cleaned);
|
||||
cleaned = compactXmlSerializer.getAsString( " <div class=\"foo\">2 roots \n < here > </div>");
|
||||
assertEquals("<div class=\"foo\">2 roots < here ></div>\n", cleaned);
|
||||
cleaned = compactXmlSerializer.getAsString( " <div class=\"foo\">2 roots \n\n < here > </div>");
|
||||
assertEquals("<div class=\"foo\">2 roots <br />< here ></div>\n", cleaned);
|
||||
}
|
||||
|
||||
/**
|
||||
* Non-breakable spaces also must be removed from start and end.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testRemoveLeadingAndEndingNbsp() throws IOException {
|
||||
String cleaned = compactXmlSerializer.getAsString(
|
||||
" We have just released Jericho Road. Listen to Still Waters the lead-off track.");
|
||||
assertEquals("We have just released Jericho Road. Listen to Still Waters the lead-off track.", cleaned);
|
||||
cleaned = compactXmlSerializer.getAsString(
|
||||
" We have just released Jericho Road. Listen to Still Waters the lead-off track. ");
|
||||
assertEquals("We have just released Jericho Road. Listen to Still Waters the lead-off track.", cleaned);
|
||||
cleaned = compactXmlSerializer.getAsString(
|
||||
" We have just released Jericho Road. Listen to Still Waters the lead-off track. ");
|
||||
assertEquals("We have just released Jericho Road. Listen to Still Waters the lead-off track.", cleaned);
|
||||
cleaned = compactXmlSerializer.getAsString( SpecialEntities.NON_BREAKABLE_SPACE
|
||||
+ "We have just released Jericho Road. Listen to Still Waters the lead-off track. "
|
||||
+ SpecialEntities.NON_BREAKABLE_SPACE);
|
||||
assertEquals("We have just released Jericho Road. Listen to Still Waters the lead-off track.", cleaned);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that contents of 'pre' tag are untouched.
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testPreTagIsUntouched() throws IOException{
|
||||
String cleaned = compactXmlSerializer.getAsString( " <pre>some text</pre>");
|
||||
assertEquals("<pre>some text</pre>\n", cleaned);
|
||||
cleaned = compactXmlSerializer.getAsString( "<pre> some text</pre>");
|
||||
assertEquals("<pre> some text</pre>\n", cleaned);
|
||||
cleaned = compactXmlSerializer.getAsString( "<pre>some /n/n text</pre>");
|
||||
assertEquals("<pre>some /n/n text</pre>\n", cleaned);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,604 @@
|
||||
/* Copyright (c) 2006-2013, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
public class CDATATest extends AbstractHtmlCleanerTest {
|
||||
|
||||
/**
|
||||
* Test for bug #189
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
public void UnclosedCDATA() throws Exception{
|
||||
String html = "<script><![CDATA[";
|
||||
String x = "";
|
||||
for (int i = 0; i < 2048; i++){x+="x";};
|
||||
html += x;
|
||||
html += "</script><p>Test</p>";
|
||||
|
||||
String expected = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script><p>Test</p>";
|
||||
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(true);
|
||||
assertCleaned(html, expected);
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test for bug #211
|
||||
* This passes, but is marked @Ignore because it takes a while to run. Comment
|
||||
* out ignore and run this test before making any builds.
|
||||
* @throws Exception
|
||||
*/
|
||||
@Ignore
|
||||
@Test
|
||||
public void UnclosedCDATA2() throws Exception{
|
||||
String html = "<script><![CDATA[";
|
||||
String x = "";
|
||||
for (int i = 0; i < 513*1024; i++){x+="x";};
|
||||
html += x;
|
||||
html += "</script><p>Test</p>";
|
||||
|
||||
String expected = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script><p>Test</p>";
|
||||
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(true);
|
||||
assertCleaned(html, expected);
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(false);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Test for bug #185
|
||||
//
|
||||
@Test
|
||||
public void noEndTokenLong() throws Exception{
|
||||
String initial = "<script><![CDATA[";
|
||||
String x = "";
|
||||
for (int i = 0; i < 1024; i++){x+="x";};
|
||||
String expected = "<script>"+x+"</script>";
|
||||
String expectedXml = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script>";
|
||||
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(true);
|
||||
assertCleanedHtml(initial+x, expected);
|
||||
assertCleaned(initial+x, expectedXml);
|
||||
assertCleanedJDom(initial+x, expectedXml);
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(false);
|
||||
}
|
||||
|
||||
//
|
||||
// Test for bug #189
|
||||
//
|
||||
@Test
|
||||
public void noEndTokenReallyLong() throws Exception{
|
||||
String initial = "<script><![CDATA[";
|
||||
String x = "";
|
||||
for (int i = 0; i < 4096; i++){x+="x";};
|
||||
String expected = "<script>"+x+"</script>";
|
||||
String expectedXml = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script>";
|
||||
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(true);
|
||||
assertCleanedHtml(initial+x, expected);
|
||||
assertCleaned(initial+x, expectedXml);
|
||||
assertCleanedJDom(initial+x, expectedXml);
|
||||
cleaner.getProperties().setOmitHtmlEnvelope(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* This is to test issue #134
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void strayEndTagInCDATA() throws IOException{
|
||||
String initial = readFile("src/test/resources/test31.html");
|
||||
cleaner.clean(initial);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we escape CDATA in regular HTML content
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void NotReallyCData() throws IOException{
|
||||
String initial = "<p><![CDATA ]]> is sometimes used in XML";
|
||||
String expected = "<html>\n<head />\n<body><p><![CDATA ]]> is sometimes used in XML</p></body></html>";
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* This is a simple no-op test; when we use a HTML serializer we don't
|
||||
* automatically wrap the contents of script tags in a CDATA, as we do with
|
||||
* the XML serializers
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void NoCData() throws IOException{
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(true);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(true);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
cleanerProperties.setUseCdataForScriptAndStyle(true);
|
||||
this.cleaner = new HtmlCleaner(cleanerProperties);
|
||||
this.serializer = new SimpleHtmlSerializer(cleaner.getProperties());
|
||||
|
||||
String initial = "<html><head><script>function testNoOp(){<>}</script></head><body></body></html>";
|
||||
String expected = initial;
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* In this test the script has no CDATA, an unescaped CDATAsection in a
|
||||
* script tag, and there is also an incorrect CDATA declaration in a
|
||||
* paragraph tag.
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void CDATAmixed() throws IOException{
|
||||
String initial = readFile("src/test/resources/test11.html");
|
||||
String expected = readFile("src/test/resources/test11_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void CDATAandDocType() throws IOException{
|
||||
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(false);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(false);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
this.cleaner = new HtmlCleaner(cleanerProperties);
|
||||
this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
|
||||
|
||||
String initial = readFile("src/test/resources/test12.html");
|
||||
String expected = readFile("src/test/resources/test12_expected.html");
|
||||
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scriptAndCData() throws IOException
|
||||
{
|
||||
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(false);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(false);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
cleanerProperties.setAddNewlineToHeadAndBody(false);
|
||||
cleanerProperties.setUseCdataFor("script,style,altscript");
|
||||
this.cleaner = new HtmlCleaner(cleanerProperties);
|
||||
this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
|
||||
|
||||
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
|
||||
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
|
||||
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
|
||||
|
||||
assertHTMLWithHeader(
|
||||
"<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
|
||||
"<style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style>");
|
||||
|
||||
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
|
||||
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
|
||||
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
|
||||
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
|
||||
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n"
|
||||
+ "// \n"
|
||||
+ "function escapeForXML(origtext) {\n"
|
||||
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
|
||||
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
|
||||
+ "}\n"
|
||||
+ "// \n/*]]>*/"
|
||||
+ "</script>", "<script type=\"text/javascript\">\n"
|
||||
+ "// <![CDATA[\n"
|
||||
+ "function escapeForXML(origtext) {\n"
|
||||
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
|
||||
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
|
||||
+ "}\n"
|
||||
+ "// ]]>\n"
|
||||
+ "</script>");
|
||||
|
||||
assertHTML("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
|
||||
|
||||
assertHTML("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
|
||||
|
||||
assertHTML(
|
||||
"<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
|
||||
"<script>//<![CDATA[\nbanana(); //--></script><script>two</script>"
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scriptAndCDataDom() throws IOException, ParserConfigurationException, Exception
|
||||
{
|
||||
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(false);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(false);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
cleanerProperties.setAddNewlineToHeadAndBody(false);
|
||||
cleanerProperties.setUseCdataFor("script,style,altscript");
|
||||
this.cleaner = new HtmlCleaner(cleanerProperties);
|
||||
|
||||
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
|
||||
|
||||
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
|
||||
|
||||
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
|
||||
|
||||
assertHTMLIncludingHeaderUsingJDomSerializer(
|
||||
"<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
|
||||
"<html><head><style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style></head></html>"
|
||||
);
|
||||
|
||||
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
|
||||
|
||||
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
|
||||
|
||||
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
|
||||
|
||||
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
|
||||
|
||||
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n"
|
||||
+ "// \n"
|
||||
+ "function escapeForXML(origtext) {\n"
|
||||
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
|
||||
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
|
||||
+ "}\n"
|
||||
+ "// \n/*]]>*/"
|
||||
+ "</script>", "<script type=\"text/javascript\">\n"
|
||||
+ "// <![CDATA[\n"
|
||||
+ "function escapeForXML(origtext) {\n"
|
||||
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
|
||||
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
|
||||
+ "}\n"
|
||||
+ "// ]]>\n"
|
||||
+ "</script>");
|
||||
|
||||
assertHTMLUsingDomSerializer("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
|
||||
|
||||
assertHTMLUsingDomSerializer("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
|
||||
|
||||
assertHTMLUsingDomSerializer(
|
||||
"<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
|
||||
"<script>//<![CDATA[\nbanana(); //--></script><script>two</script>"
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scriptAndCDataJDom() throws IOException, ParserConfigurationException
|
||||
{
|
||||
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(false);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(false);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
cleanerProperties.setAddNewlineToHeadAndBody(false);
|
||||
cleanerProperties.setUseCdataFor("script,style,altscript");
|
||||
this.cleaner = new HtmlCleaner(cleanerProperties);
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
|
||||
|
||||
assertHTMLIncludingHeaderUsingJDomSerializer("<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
|
||||
"<style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style>");
|
||||
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n"
|
||||
+ "// \n"
|
||||
+ "function escapeForXML(origtext) {\n"
|
||||
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
|
||||
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
|
||||
+ "}\n"
|
||||
+ "// \n/*]]>*/"
|
||||
+ "</script>", "<script type=\"text/javascript\">\n"
|
||||
+ "// <![CDATA[\n"
|
||||
+ "function escapeForXML(origtext) {\n"
|
||||
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
|
||||
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
|
||||
+ "}\n"
|
||||
+ "// ]]>\n"
|
||||
+ "</script>");
|
||||
|
||||
assertHTMLUsingJDomSerializer("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
|
||||
|
||||
assertHTMLUsingJDomSerializer("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
|
||||
|
||||
assertHTMLUsingJDomSerializer(
|
||||
"<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
|
||||
"<script>//<![CDATA[\nbanana(); //--></script><script>two</script>"
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void escapingCDATA() throws IOException{
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(false);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(false);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
cleanerProperties.setAdvancedXmlEscape(true);
|
||||
cleanerProperties.setAddNewlineToHeadAndBody(false);
|
||||
cleanerProperties.setDeserializeEntities(true);
|
||||
cleanerProperties.setUseCdataFor("script,style,altscript");
|
||||
this.cleaner = new HtmlCleaner(cleanerProperties);
|
||||
this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
|
||||
assertHTML("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
|
||||
assertHTML("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void removeCDATA() throws IOException{
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitCdataOutsideScriptAndStyle(true);
|
||||
cleanerProperties.setAddNewlineToHeadAndBody(false);
|
||||
cleanerProperties.setUseCdataFor("script,style,altscript");
|
||||
cleaner = new HtmlCleaner(cleanerProperties);
|
||||
serializer = new SimpleXmlSerializer(cleaner.getProperties());
|
||||
|
||||
// Verify that CDATA not inside SCRIPT or STYLE elements are considered comments in HTML and thus stripped
|
||||
// when cleaned.
|
||||
assertHTML("<p></p>", "<p><![CDATA[&]]></p>");
|
||||
assertHTML("<p>&&</p>", "<p>&<![CDATA[&]]>&</p>");
|
||||
assertHTML("<noaltscript />", "<noaltscript><![CDATA[&]]></noaltscript>");
|
||||
}
|
||||
|
||||
/**
|
||||
* Using the default setup, we should strip out CData outside
|
||||
* of script and style tags.
|
||||
*/
|
||||
@Test
|
||||
public void CDATAinthewrongplace(){
|
||||
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setIgnoreQuestAndExclam(true);
|
||||
|
||||
cleaner = new HtmlCleaner(cleanerProperties);
|
||||
|
||||
String testData = ""
|
||||
+ "<p>"
|
||||
+ "<![CDATA[\n"
|
||||
+ "function helloWorld() {\n"
|
||||
+ "};\n"
|
||||
+ "]]>\n"
|
||||
+ "</p>";
|
||||
|
||||
TagNode cleaned = cleaner.clean(testData);
|
||||
TagNode p = cleaned.findElementByName("p", true);
|
||||
|
||||
//
|
||||
// We should have no CData nodes, instead the contents should
|
||||
// be processed as content and escaped as usual
|
||||
//
|
||||
assertTrue(p.getAllChildren().get(0) instanceof ContentNode);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void nonSafeCDATA(){
|
||||
String testData = ""
|
||||
+ "<script type=\"text/javascript\">"
|
||||
+ "<![CDATA[\n"
|
||||
+ "function helloWorld() {\n"
|
||||
+ "};\n"
|
||||
+ "]]>\n"
|
||||
+ "</script>";
|
||||
|
||||
TagNode cleaned = cleaner.clean(testData);
|
||||
TagNode script = cleaned.findElementByName("script", true);
|
||||
|
||||
|
||||
//
|
||||
// We should have a CData node for the CDATA section
|
||||
//
|
||||
assertTrue(script.getAllChildren().get(0) instanceof CData);
|
||||
CData cdata = (CData)script.getAllChildren().get(0);
|
||||
|
||||
String content = cdata.getContentWithoutStartAndEndTokens();
|
||||
assertEquals("\nfunction helloWorld() {\n};\n", content);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void safeOutput(){
|
||||
String testData = ""
|
||||
+ "<script type=\"text/javascript\">"
|
||||
+ "<![CDATA[\n"
|
||||
+ "function helloWorld() {\n"
|
||||
+ "};\n"
|
||||
+ "]]>\n"
|
||||
+ "</script>";
|
||||
|
||||
TagNode cleaned = cleaner.clean(testData);
|
||||
TagNode script = cleaned.findElementByName("script", true);
|
||||
|
||||
|
||||
//
|
||||
// We should have a CData node for the CDATA section
|
||||
//
|
||||
assertTrue(script.getAllChildren().get(0) instanceof CData);
|
||||
CData cdata = (CData)script.getAllChildren().get(0);
|
||||
|
||||
String content = cdata.getContentWithoutStartAndEndTokens();
|
||||
assertEquals("\nfunction helloWorld() {\n};\n", content);
|
||||
|
||||
String safeContent = cdata.getContentWithStartAndEndTokens();
|
||||
assertEquals("/*<![CDATA[*/\nfunction helloWorld() {\n};\n/*]]>*/", safeContent);
|
||||
}
|
||||
|
||||
/**
|
||||
* For a CDATA section we need to ignore '<' and '>' and keep going to keep the content
|
||||
* within a single CData instance.
|
||||
*/
|
||||
@Test
|
||||
public void safeCDATAAlternate(){
|
||||
String testData = ""
|
||||
+ "<script type=\"text/javascript\">\n"
|
||||
+ "//<![CDATA[\n"
|
||||
+ "function escapeForXML(origtext) {\n"
|
||||
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
|
||||
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
|
||||
+ "}\n"
|
||||
+ "//]]>\n"
|
||||
+ "</script>";
|
||||
|
||||
TagNode cleaned = cleaner.clean(testData);
|
||||
TagNode script = cleaned.findElementByName("script", true);
|
||||
|
||||
|
||||
//
|
||||
// We should have a CData node for the CDATA section
|
||||
//
|
||||
assertTrue(script.getAllChildren().get(1) instanceof CData);
|
||||
CData cdata = (CData)script.getAllChildren().get(1);
|
||||
|
||||
String content = cdata.getContentWithoutStartAndEndTokens();
|
||||
assertEquals("\nfunction escapeForXML(origtext) {\n return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n .replace(/>/g,'&'+'gt;').replace(/'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');}\n", content);
|
||||
}
|
||||
|
||||
/**
|
||||
* For a CDATA section we need to ignore '<' and '>' and keep going to keep the content
|
||||
* within a single CData instance
|
||||
*/
|
||||
@Test
|
||||
public void safeCDATA(){
|
||||
String testData = ""
|
||||
+ "<script type=\"text/javascript\">\n"
|
||||
+ "/*<![CDATA[*/\n"
|
||||
+ "function escapeForXML(origtext) {\n"
|
||||
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
|
||||
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
|
||||
+ "}\n"
|
||||
+ "/*]]>*/>\n"
|
||||
+ "</script>";
|
||||
|
||||
TagNode cleaned = cleaner.clean(testData);
|
||||
TagNode script = cleaned.findElementByName("script", true);
|
||||
|
||||
|
||||
//
|
||||
// We should have a CData node for the CDATA section
|
||||
//
|
||||
assertTrue(script.getAllChildren().get(1) instanceof CData);
|
||||
CData cdata = (CData)script.getAllChildren().get(1);
|
||||
|
||||
String content = cdata.getContentWithoutStartAndEndTokens();
|
||||
assertEquals("\nfunction escapeForXML(origtext) {\n return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n .replace(/>/g,'&'+'gt;').replace(/'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');}\n", content);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void style(){
|
||||
String testData = "<style type=\"text/css\">/*<![CDATA[*/\n#ampmep_188 { }\n/*]]>*/</style>";
|
||||
TagNode cleaned = cleaner.clean(testData);
|
||||
TagNode style = cleaned.findElementByName("style", true);
|
||||
|
||||
assertTrue(style.getAllChildren().get(0) instanceof CData);
|
||||
|
||||
String content = (((CData)style.getAllChildren().get(0)).getContentWithoutStartAndEndTokens());
|
||||
|
||||
assertEquals("\n#ampmep_188 { }\n", content);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void preserveComments() throws IOException{
|
||||
cleaner.getProperties().setOmitXmlDeclaration(false);
|
||||
String initial = readFile("src/test/resources/test17.html");
|
||||
String expected = readFile("src/test/resources/test17_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void preserveCommentsXwiki() throws IOException{
|
||||
cleaner.getProperties().setOmitXmlDeclaration(false);
|
||||
cleaner.getProperties().setAddNewlineToHeadAndBody(false);
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>"
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void preserveComments2() throws IOException{
|
||||
cleaner.getProperties().setOmitXmlDeclaration(false);
|
||||
cleaner.getProperties().setAddNewlineToHeadAndBody(false);
|
||||
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//alert(\"Hello World\")\n/*]]>*/</script>",
|
||||
"<script type=\"text/javascript\"><![CDATA[//alert(\"Hello World\")]]></script>"
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,125 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Tests that tag closed due to one of its children (when the child tag is not allowed to be inside parent) is then
|
||||
* reopened.
|
||||
* Examples:
|
||||
* <pre>
|
||||
* <div><p>text1<table><tr><td>text2</td></tr></table>text3</p></div>
|
||||
* </pre>
|
||||
* table is not allowed inside a <p> most browsers handle this by placing the table close to line before and line after and in general allowing it.
|
||||
*
|
||||
* Cleaning here normally would result in :
|
||||
* <pre>
|
||||
* <div><p>text1<table><tr><td>text2</td></tr></table>text3</div>
|
||||
* </pre>
|
||||
* 'text3' is no longer inside the original element type ( 'p' ). Instead 'text3' is now within a 'div'.
|
||||
* text3 would no longer be styled correctly.
|
||||
*
|
||||
* A more correct result is:
|
||||
* <pre>
|
||||
* <div><p>text1<table><tr><td>text2</td></tr></table><p>text3</p></div>
|
||||
* </pre>
|
||||
*/
|
||||
public class ClosedTagReopenTest extends TestCase {
|
||||
|
||||
public void testSimpleHTML4() throws IOException {
|
||||
CleanerProperties properties = new CleanerProperties();
|
||||
properties.setHtmlVersion(HtmlCleaner.HTML_4);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
properties.setOmitHtmlEnvelope(true);
|
||||
SimpleXmlSerializer serializer = new SimpleXmlSerializer(properties);
|
||||
String[][] tests= {
|
||||
new String[] { "<p>text1<table><tr><td>text2</td></tr></table>text3</p>", "<p>text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p>text3</p>" },
|
||||
new String[] {"</p>text1","text1"},
|
||||
new String[] {"<p>text1<div>text2</div>text3</p>", "<p>text1</p><div>text2</div><p>text3</p>"},
|
||||
new String[] { "<div>text1<p>text2</p>text3</div>", "<div>text1<p>text2</p>text3</div>"},
|
||||
new String[] {"<font>text1<p>text2</p>text3</font>", "<font>text1</font><p><font>text2</font></p><font>text3</font>"},
|
||||
new String[] {"<p>text1<div>text2</div>text3<div>text4</div></p>", "<p>text1</p><div>text2</div><p>text3</p><div>text4</div>"},
|
||||
new String[] {"<p>text1<div>text2</div></p>", "<p>text1</p><div>text2</div>"},
|
||||
new String[] {"<p>text1<p>text2</p></p>", "<p>text1</p><p>text2</p>"},
|
||||
//test multiple internal breaks
|
||||
new String[] {"<p><div>text1<p>text2<div>text3<p>text4<div>text5</div></p></div></p></div>","<p></p><div>text1<p>text2</p><div>text3<p>text4</p><div>text5</div></div></div>"},
|
||||
// test attribute preservation
|
||||
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2</td></tr></table>text3</p>",
|
||||
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
|
||||
// but not all attributes ( id attribute must be unique )
|
||||
// TODO: maybe a generated id so that correlation can be found?
|
||||
new String[] { "<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1<table><tr><td>text2</td></tr></table>text3</p>",
|
||||
"<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
|
||||
// test multiple replacements
|
||||
// test to see if nested good <p> can be handled.
|
||||
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p>text2a</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
|
||||
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p>text2a</p></td></tr></tbody></table>" +
|
||||
"<p class=\"p_class\" random=\"attribute\">text3</p>" +
|
||||
"<ul><li>text4</li></ul>" +
|
||||
"<p class=\"p_class\" random=\"attribute\">text5</p>" +
|
||||
"<ul><li>text6</li></ul>" },
|
||||
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p class=\"another_p_element\">text2a<div>test2b</div>test2c</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
|
||||
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p class=\"another_p_element\">text2a</p><div>test2b</div><p class=\"another_p_element\">test2c</p></td></tr></tbody></table>" +
|
||||
"<p class=\"p_class\" random=\"attribute\">text3</p>" +
|
||||
"<ul><li>text4</li></ul>" +
|
||||
"<p class=\"p_class\" random=\"attribute\">text5</p>" +
|
||||
"<ul><li>text6</li></ul>" },
|
||||
new String[]{"<p>text1<table><tr><td>text2<tr><td>text3</table>text4</p>","<p>text1</p><table><tbody><tr><td>text2</td></tr><tr><td>text3</td></tr></tbody></table><p>text4</p>"}
|
||||
};
|
||||
for(String[] test: tests) {
|
||||
String cleaned = serializer.getAsString(test[0]);
|
||||
assertEquals("started with="+test[0], test[1], cleaned);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleHTML5() throws IOException {
|
||||
CleanerProperties properties = new CleanerProperties();
|
||||
properties.setHtmlVersion(HtmlCleaner.HTML_5);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
properties.setOmitHtmlEnvelope(true);
|
||||
SimpleXmlSerializer serializer = new SimpleXmlSerializer(properties);
|
||||
String[][] tests= {
|
||||
new String[] { "<p>text1<table><tr><td>text2</td></tr></table>text3</p>", "<p>text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p>text3</p>" },
|
||||
new String[] {"</p>text1","text1"},
|
||||
new String[] {"<p>text1<div>text2</div>text3</p>", "<p>text1</p><div>text2</div><p>text3</p>"},
|
||||
new String[] { "<div>text1<p>text2</p>text3</div>", "<div>text1<p>text2</p>text3</div>"},
|
||||
new String[] {"text1<p>text2</p>text3", "text1<p>text2</p>text3"},
|
||||
new String[] {"<p>text1<div>text2</div>text3<div>text4</div></p>", "<p>text1</p><div>text2</div><p>text3</p><div>text4</div>"},
|
||||
new String[] {"<p>text1<div>text2</div></p>", "<p>text1</p><div>text2</div>"},
|
||||
new String[] {"<p>text1<p>text2</p></p>", "<p>text1</p><p>text2</p>"},
|
||||
//test multiple internal breaks
|
||||
new String[] {"<p><div>text1<p>text2<div>text3<p>text4<div>text5</div></p></div></p></div>","<p></p><div>text1<p>text2</p><div>text3<p>text4</p><div>text5</div></div></div>"},
|
||||
// test attribute preservation
|
||||
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2</td></tr></table>text3</p>",
|
||||
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
|
||||
// but not all attributes ( id attribute must be unique )
|
||||
// TODO: maybe a generated id so that correlation can be found?
|
||||
new String[] { "<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1<table><tr><td>text2</td></tr></table>text3</p>",
|
||||
"<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
|
||||
// test multiple replacements
|
||||
// test to see if nested good <p> can be handled.
|
||||
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p>text2a</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
|
||||
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p>text2a</p></td></tr></tbody></table>" +
|
||||
"<p class=\"p_class\" random=\"attribute\">text3</p>" +
|
||||
"<ul><li>text4</li></ul>" +
|
||||
"<p class=\"p_class\" random=\"attribute\">text5</p>" +
|
||||
"<ul><li>text6</li></ul>" },
|
||||
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p class=\"another_p_element\">text2a<div>test2b</div>test2c</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
|
||||
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p class=\"another_p_element\">text2a</p><div>test2b</div><p class=\"another_p_element\">test2c</p></td></tr></tbody></table>" +
|
||||
"<p class=\"p_class\" random=\"attribute\">text3</p>" +
|
||||
"<ul><li>text4</li></ul>" +
|
||||
"<p class=\"p_class\" random=\"attribute\">text5</p>" +
|
||||
"<ul><li>text6</li></ul>" },
|
||||
new String[]{"<p>text1<table><tr><td>text2<tr><td>text3</table>text4</p>","<p>text1</p><table><tbody><tr><td>text2</td></tr><tr><td>text3</td></tr></tbody></table><p>text4</p>"}
|
||||
};
|
||||
for(String[] test: tests) {
|
||||
String cleaned = serializer.getAsString(test[0]);
|
||||
assertEquals("started with="+test[0], test[1], cleaned);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,215 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.htmlcleaner.conditional.TagNodeEmptyContentCondition;
|
||||
import org.htmlcleaner.conditional.TagNodeInsignificantBrCondition;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Various tests for collapseNullHtml mode.
|
||||
*/
|
||||
public class CollapseHtmlTest extends TestCase {
|
||||
|
||||
/**
|
||||
* Table row made of an empty cell followed by a cell that contains text.
|
||||
*/
|
||||
private static final String CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR = "<tr><td></td><td>Cannot eliminate anything in this row</td></tr>";
|
||||
|
||||
/**
|
||||
* Markup of a single {@code img} element.
|
||||
*/
|
||||
private static final String IMAGE = "<img src=\"http://localhost:8080/img/foo.jpg\" />";
|
||||
|
||||
/**
|
||||
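* Input markup built around {@link #IMAGE}, paired with {@link #DONT_COLLAPSE_OUTPUT}; presumably content that pruning must leave in place - confirm against the tests that use it.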
|
||||
*/
|
||||
private static final String DONT_COLLAPSE = "<span>" + IMAGE + "</span>" + "<p>" + IMAGE + "</p>"
|
||||
+ "<p>bar<table><tr><td></td><td>" + IMAGE + "</td><td> </td></tr></table>foo</p>";
|
||||
private static final String DONT_COLLAPSE_OUTPUT = "<span>" + IMAGE + "</span>" + "<p>" + IMAGE + "</p>"
|
||||
+ "<p>bar</p><table><tbody><tr><td></td><td>" + IMAGE + "</td><td> </td></tr></tbody></table><p>foo</p>";
|
||||
private HtmlCleaner cleaner;
|
||||
|
||||
private CleanerProperties properties;
|
||||
|
||||
private SimpleXmlSerializer serializer;
|
||||
|
||||
@Override
|
||||
protected void setUp() throws Exception {
|
||||
cleaner = new HtmlCleaner();
|
||||
properties = cleaner.getProperties();
|
||||
properties.setOmitHtmlEnvelope(true);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
serializer = new SimpleXmlSerializer(properties);
|
||||
properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
|
||||
properties.addPruneTagNodeCondition(new TagNodeInsignificantBrCondition());
|
||||
}
|
||||
|
||||
/**
|
||||
* Make sure that single empty tag is dropped out.
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testCollapseSingleEmptyTag() throws IOException {
|
||||
TagNode collapsed = cleaner.clean("<u></u>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
/**
|
||||
* Make sure that tags with internal blanks are collapsed.
|
||||
*/
|
||||
public void testCollapseSingleTagWithBlanks() throws IOException {
|
||||
TagNode collapsed = cleaner.clean("<u> </u>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<u>   </u>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
// Strange msword insert
|
||||
// collapsed =
|
||||
// cleaner.clean("<span style='mso-spacerun:yes'> </span>");
|
||||
// assertEquals("", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
/**
|
||||
* make sure that non-breaking spaces are also collapsed away.
|
||||
*/
|
||||
public void testCollapseSingleTagWithNbsp() throws IOException {
|
||||
TagNode collapsed = cleaner.clean("<u> </u>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<u>   </u>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<u>   </u>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<u> " + SpecialEntities.NON_BREAKABLE_SPACE + " </u>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
/**
|
||||
* make sure that multiple null tags are collapsed.
|
||||
*/
|
||||
public void testCollapseMultipleEmptyTags() throws IOException {
|
||||
TagNode collapsed = cleaner.clean("<b><i><u></u></i></b>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
|
||||
// test with slightly bad html.
|
||||
collapsed = cleaner.clean("<b><i><u></i></u></b>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
// test with slightly bad html.
|
||||
collapsed = cleaner.clean("<b><i><u></i></u>notme</b>");
|
||||
assertEquals("<b>notme</b>", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
/**
|
||||
* make sure that insignificant br tags are collapsed
|
||||
*/
|
||||
public void testCollapseInsignificantBr() throws IOException {
|
||||
TagNode collapsed = cleaner.clean("<p><br/>Some text</p>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<p>Some text<BR/></p>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<p><br/>Some<br/> text<br/></p>");
|
||||
assertEquals("<p>Some<br /> text</p>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<p><br/><br/>Some text <i>look here</i></p>");
|
||||
assertEquals("<p>Some text <i>look here</i></p>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("Some text<BR/>");
|
||||
assertEquals("Some text", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
/**
|
||||
* make sure TagTransformations do not interfere with collapse
|
||||
*/
|
||||
public void testCollapseEmptyWithTagTransformations() throws IOException {
|
||||
CleanerTransformations transformations = properties.getCleanerTransformations();
|
||||
TagTransformation t = new TagTransformation("font", "span", true);
|
||||
t.addAttributeTransformation("style", "${style};font-family:${face};font-size:${size};color:${color};");
|
||||
t.addAttributeTransformation("face");
|
||||
t.addAttributeTransformation("size");
|
||||
t.addAttributeTransformation("color");
|
||||
t.addAttributeTransformation("name", "${face}_1");
|
||||
transformations.addTransformation(t);
|
||||
TagNode collapsed = cleaner.clean("<b><font face=\"Ariel\"><u></u></font></b>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
/**
|
||||
* test to make sure that multiple <br>
|
||||
* elements are eliminated
|
||||
*/
|
||||
public void testChainCollapseInsignificantBrs() throws IOException {
|
||||
TagNode collapsed = cleaner.clean("<p><br/><br>Some<br>text<br/><br><br></p>");
|
||||
assertEquals("<p>Some<br />text</p>", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
/**
|
||||
* make sure that intervening empty elements still cause unneeded <br>
|
||||
* s to be eliminated.
|
||||
*/
|
||||
public void testCollapseInsignificantBrWithEmptyElementsHTML4() throws IOException {
|
||||
properties.setHtmlVersion(HtmlCleaner.HTML_4);
|
||||
properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
|
||||
TagNode collapsed = cleaner.clean("<p><span> </span><br/>Some text</p>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<p>Some text<br><span></span><BR/><u><big></big></u><BR/></p>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<p>Some text<br><span></span><BR/><u><big></big></u><BR/><u></u></p>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
|
||||
}
|
||||
|
||||
public void testCollapseInsignificantBrWithEmptyElementsHTML5() throws IOException {
|
||||
properties.setHtmlVersion(HtmlCleaner.HTML_5);
|
||||
properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
|
||||
TagNode collapsed = cleaner.clean("<p><span> </span><br/>Some text</p>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<p>Some text<br><span></span><BR/><u></u><BR/></p>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<p>Some text<br><span></span><BR/><u></u><BR/><u></u></p>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Br nested in formating elements should be eliminated.
|
||||
*/
|
||||
public void testInsureMeaninglessBrsStillCollapseEmptyElementsHTML4() throws IOException {
|
||||
properties.setHtmlVersion(HtmlCleaner.HTML_4);
|
||||
properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
|
||||
TagNode collapsed;
|
||||
collapsed = cleaner.clean("<p><u><br/></u>Some text<br><span><BR/><u><big><BR/></big></u></p></span>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
|
||||
public void testInsureMeaninglessBrsStillCollapseEmptyElementsHTML5() throws IOException {
|
||||
properties.setHtmlVersion(HtmlCleaner.HTML_5);
|
||||
properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
|
||||
TagNode collapsed;
|
||||
collapsed = cleaner.clean("<p><u><br/></u>Some text<br><span><BR/><u><BR/></u></p></span>");
|
||||
assertEquals("<p>Some text</p>", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
/**
|
||||
* because elements with ids can be referred to by javascript, don't assume
|
||||
* that such elements can be eliminated.
|
||||
*/
|
||||
public void testCollapseOnlyFormattingElementsWithNoIds() throws IOException {
|
||||
TagNode collapsed = cleaner.clean("<b id=\"notme\"></b><span></span><span id=\"norme\"></span>");
|
||||
assertEquals("<b id=\"notme\"></b><span id=\"norme\"></span>", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean("<b iD=\"notme\"></b><span></span><span ID=\"norme\"></span>");
|
||||
assertEquals("<b id=\"notme\"></b><span id=\"norme\"></span>", serializer.getAsString(collapsed));
|
||||
}
|
||||
|
||||
public void testCollapseAggressively() throws IOException {
|
||||
properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
|
||||
TagNode collapsed;
|
||||
collapsed = cleaner.clean("<p><table><tr></tr><tr><td></td></tr></table></p>");
|
||||
assertEquals("", serializer.getAsString(collapsed));
|
||||
collapsed = cleaner.clean(DONT_COLLAPSE);
|
||||
assertEquals(DONT_COLLAPSE_OUTPUT, serializer.getAsString(collapsed));
|
||||
collapsed = cleaner
|
||||
.clean("<p id=\"notme\"></p><table><tr></tr><tr><td>Nor me</td></tr><tr><td></td></tr><tr> </tr>"
|
||||
+ "<tr> \n</tr>" + CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR + "</table>");
|
||||
assertEquals("<p id=\"notme\"></p><table><tbody><tr><td>Nor me</td></tr>"
|
||||
+ CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR + "</tbody></table>", serializer.getAsString(collapsed));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
|
||||
/**
|
||||
* Testing HtmlCleaner constructors.
|
||||
*/
|
||||
public class ConstructorTest extends TestCase {
|
||||
|
||||
public void testPropertiesConstructor() throws Exception {
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setOmitComments(true);
|
||||
|
||||
HtmlCleaner cleaner1 = new HtmlCleaner(props);
|
||||
TagNode node1 = cleaner1.clean("<a href=index.htm><b><!--COMMENT 1--><b>text text<body>");
|
||||
assertTrue( new SimpleXmlSerializer(props).getAsString(node1).indexOf("<!--COMMENT 1-->") < 0 );
|
||||
|
||||
HtmlCleaner cleaner2 = new HtmlCleaner(props);
|
||||
TagNode node2 = cleaner2.clean("<span href=index1.htm><b><!--COMMENT 2--><x>DDDD text<body>");
|
||||
assertTrue( new SimpleXmlSerializer(props).getAsString(node2).indexOf("<!--COMMENT 2-->") < 0 );
|
||||
|
||||
HtmlCleaner cleaner3 = new HtmlCleaner(props);
|
||||
props.setOmitComments(false);
|
||||
TagNode node3 = cleaner3.clean("<a href=index3.htm><b><!--COMMENT 3--><x>EEEEEEE text<body>");
|
||||
assertTrue( new SimpleXmlSerializer(props).getAsString(node3).indexOf("<!--COMMENT 3-->") > 0 );
|
||||
|
||||
TagNode node4 = cleaner3.clean( new ByteArrayInputStream( ("FIRST" + (char)0x2 + (char)0x3 + "SECOND").getBytes() ), "ASCII" );
|
||||
assertTrue( new CompactXmlSerializer(props).getAsString(node4).indexOf("FIRST SECOND") >= 0 );
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,453 @@
|
||||
/* Copyright (c) 2006-2013, HtmlCleaner project team (Vladimir Nikic, Scott Wilson, Pat Moore)
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
You can contact Vladimir Nikic by sending e-mail to
|
||||
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
|
||||
subject line.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.w3c.dom.Document;
|
||||
|
||||
public class DocTypesTest extends AbstractHtmlCleanerTest{
|
||||
|
||||
|
||||
@Test
|
||||
public void DocTypeUsingDom() throws IOException, ParserConfigurationException{
|
||||
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(false);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(false);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
cleaner = new HtmlCleaner(cleanerProperties);
|
||||
|
||||
DomSerializer domSerializer = new DomSerializer(cleaner.getProperties());
|
||||
String initial = readFile("src/test/resources/test12.html");
|
||||
TagNode cleaned = cleaner.clean(initial);
|
||||
|
||||
Document doc = domSerializer.createDOM(cleaned);
|
||||
|
||||
assertEquals("html", doc.getDoctype().getName());
|
||||
assertEquals("-//W3C//DTD XHTML 1.0 Strict//EN", doc.getDoctype().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", doc.getDoctype().getSystemId());
|
||||
}
|
||||
|
||||
// TODO remove and make this class a subclass of AbstractHtmlCleanerTest
|
||||
protected String readFile(String filename) throws IOException {
|
||||
File file = new File(filename);
|
||||
CharSequence content = Utils.readUrl(file.toURI().toURL(), "UTF-8");
|
||||
return content.toString();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void none() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE><html><body></body></html>");
|
||||
assertEquals(null, cleaned.getDocType().getPart1());
|
||||
assertEquals(null, cleaned.getDocType().getPart2());
|
||||
assertEquals("", cleaned.getDocType().getPublicId());
|
||||
assertEquals("", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
serializer = new SimpleHtmlSerializer(cleaner.getProperties());
|
||||
String out = serializer.getAsString(cleaned);
|
||||
assertEquals(out, "<!DOCTYPE>\n<html><head></head><body></body></html>");
|
||||
|
||||
}
|
||||
|
||||
//
|
||||
// Check all the valid doctypes
|
||||
//
|
||||
|
||||
@Test
|
||||
public void html_5() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
|
||||
assertEquals("html", cleaned.getDocType().getPart1());
|
||||
assertEquals(null, cleaned.getDocType().getPart2());
|
||||
assertEquals("", cleaned.getDocType().getPublicId());
|
||||
assertEquals("", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML5, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_5_upper() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals(null, cleaned.getDocType().getPart2());
|
||||
assertEquals("", cleaned.getDocType().getPublicId());
|
||||
assertEquals("", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML5, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_5_legacy() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM \"about:legacy-compat\"><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals("SYSTEM", cleaned.getDocType().getPart2());
|
||||
assertEquals("about:legacy-compat", cleaned.getDocType().getPublicId());
|
||||
assertEquals("", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML5_LEGACY_TOOL_COMPATIBLE, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_5_legacy_alternate() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM 'about:legacy-compat'><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals("SYSTEM", cleaned.getDocType().getPart2());
|
||||
assertEquals("about:legacy-compat", cleaned.getDocType().getPublicId());
|
||||
assertEquals("", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML5_LEGACY_TOOL_COMPATIBLE, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_0() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD HTML 4.0//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_0_strict() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\"><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD HTML 4.0//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/REC-html40/strict.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_strict_identifierOnly() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_strict_mixed() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\" SYSTEM \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
|
||||
assertEquals("html", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/html4/strict.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_strict() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/html4/strict.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_transitional() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD HTML 4.01 Transitional//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/html4/loose.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML4_01_TRANSITIONAL, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_frameset() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\"><html><body></body></html>");
|
||||
assertEquals("HTML", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD HTML 4.01 Frameset//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/html4/frameset.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.HTML4_01_FRAMESET, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_strict() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
|
||||
assertEquals("html", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD XHTML 1.0 Strict//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.XHTML1_0_STRICT, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_transitional() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html><body></body></html>");
|
||||
assertEquals("html", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD XHTML 1.0 Transitional//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.XHTML1_0_TRANSITIONAL, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_frameset() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Frameset//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd\"><html><body></body></html>");
|
||||
assertEquals("html", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD XHTML 1.0 Frameset//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.XHTML1_0_FRAMESET, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_1() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><html><body></body></html>");
|
||||
assertEquals("html", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD XHTML 1.1//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_1_basic() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
|
||||
assertEquals("html", cleaned.getDocType().getPart1());
|
||||
assertEquals("PUBLIC", cleaned.getDocType().getPart2());
|
||||
assertEquals("-//W3C//DTD XHTML Basic 1.1//EN", cleaned.getDocType().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd", cleaned.getDocType().getSystemId());
|
||||
assertEquals(DoctypeToken.XHTML1_1_BASIC, cleaned.getDocType().getType());
|
||||
assertTrue(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
//
|
||||
// Now some invalid ones
|
||||
//
|
||||
|
||||
@Test
|
||||
public void empty() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void not_html() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE banana><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_0_wrong_id_type() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM \"-//W3C//DTD HTML 4.0//EN\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_0_wrong_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_wrong_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_transitional_bad_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.HTML4_01_TRANSITIONAL, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_frameset_bad_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.HTML4_01_FRAMESET, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_0_with_wrong_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.XHTML1_0_STRICT, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_0_transitional_with_wrong_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.XHTML1_0_TRANSITIONAL, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_0_frameset_with_wrong_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Frameset//EN\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.XHTML1_0_FRAMESET, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_1_with_wrong_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_1_with_no_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"><html><body></body></html>");
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_1_basic_with_no_id() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.XHTML1_1_BASIC, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void weird_token() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html SILLY \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
|
||||
assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
|
||||
assertFalse(cleaned.getDocType().isValid());
|
||||
}
|
||||
|
||||
//
|
||||
// Serializer
|
||||
//
|
||||
|
||||
@Test
|
||||
public void html_4_01_serialize() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
|
||||
String output = serializer.getAsString(cleaned);
|
||||
assertTrue(output.startsWith("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_domserialize() throws IOException, ParserConfigurationException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
|
||||
DomSerializer domSerializer = new DomSerializer(cleaner.getProperties());
|
||||
Document doc = domSerializer.createDOM(cleaned);
|
||||
assertEquals("html", doc.getDocumentElement().getNodeName());
|
||||
assertEquals("HTML", doc.getDoctype().getName());
|
||||
assertEquals("-//W3C//DTD HTML 4.01//EN", doc.getDoctype().getPublicId());
|
||||
assertEquals("http://www.w3.org/TR/html4/strict.dtd", doc.getDoctype().getSystemId());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html_4_01_case_correct() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
|
||||
String output = serializer.getAsString(cleaned);
|
||||
assertTrue(output.startsWith("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_1_serialize() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
|
||||
String output = serializer.getAsString(cleaned);
|
||||
assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\">"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_0_strict_serialize() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
|
||||
String output = serializer.getAsString(cleaned);
|
||||
assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void xhtml_1_0_strict_serialize_case_correct() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
|
||||
String output = serializer.getAsString(cleaned);
|
||||
assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html5_serialize() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
|
||||
String output = serializer.getAsString(cleaned);
|
||||
assertTrue(output.startsWith("<!DOCTYPE html>"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void html5_serialize_case_correct() throws IOException{
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE HTML><html><body></body></html>");
|
||||
String output = serializer.getAsString(cleaned);
|
||||
assertTrue(output.startsWith("<!DOCTYPE html>"));
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Misc
|
||||
//
|
||||
|
||||
@Test
|
||||
public void checkToString(){
|
||||
TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
|
||||
assertEquals(cleaned.getDocType().getContent(), cleaned.getDocType().toString());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,380 @@
|
||||
/* Copyright (c) 2006-2019, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import org.jdom2.input.DOMBuilder;
|
||||
import org.jdom2.output.Format;
|
||||
import org.jdom2.output.XMLOutputter;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.w3c.dom.Document;
|
||||
|
||||
public class DomSerializerTest extends AbstractHtmlCleanerTest {
|
||||
|
||||
@Test
|
||||
public void removeInvalidTags3() throws Exception{
|
||||
String html="<p><^-^></p>";
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
final Document doc = new DomSerializer(cleanerProperties).createDOM(tagNode);
|
||||
assertEquals("<^-^>", doc.getElementsByTagName("p").item(0).getChildNodes().item(0).getTextContent());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void attributeCharacters() throws Exception{
|
||||
String html="<p dispariție='dispariție.'></p>";
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setAllowInvalidAttributeNames(false);
|
||||
DomSerializer ser = new DomSerializer(cleanerProperties);
|
||||
ser.setXmlVersion("1.1");
|
||||
final Document doc = ser.createDOM(tagNode);
|
||||
assertEquals("dispariție.", doc.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void attributeCharactersEncoded() throws Exception{
|
||||
String html="<p dispari\u021bie='dispari\u021bie.'></p>";
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setAllowInvalidAttributeNames(false);
|
||||
DomSerializer ser = new DomSerializer(cleanerProperties);
|
||||
ser.setXmlVersion("1.1");
|
||||
final Document doc = ser.createDOM(tagNode);
|
||||
assertEquals("dispariție.", doc.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void attributeCharacters2() throws Exception{
|
||||
String html="<p t%st='dispariție.'></p>";
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setAllowInvalidAttributeNames(false);
|
||||
final Document doc = new DomSerializer(cleanerProperties).createDOM(tagNode);
|
||||
assertEquals("dispariție.", doc.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent());
|
||||
}
|
||||
|
||||
// See bug #203
|
||||
@Test
|
||||
public void parse2() throws Exception
|
||||
{
|
||||
String html = "<div foo=\"aaa"bbb&ccc>ddd<eee\">content</div>";
|
||||
String expected = "<div foo=\"aaa"bbb&ccc>ddd<eee\">content</div>";
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
cleanerProperties.setOmitHtmlEnvelope(true);
|
||||
cleanerProperties.setOmitXmlDeclaration(true);
|
||||
String out = new SimpleXmlSerializer(cleanerProperties).getAsString(html);
|
||||
assertEquals(expected, out);
|
||||
}
|
||||
|
||||
// See bug #212
|
||||
@Test
|
||||
public void parse() throws Exception
|
||||
{
|
||||
String html = "<?xml version = \"1.0\"?><img src=\"http://xwiki.org?a=&b\"/>";
|
||||
String expected = "<img src=\"http://xwiki.org?a=&b\" />";
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final Document doc = new DomSerializer(cleanerProperties, true).createDOM(tagNode);
|
||||
assertEquals("http://xwiki.org?a=&b",
|
||||
doc.getElementsByTagName("img").item(0).getAttributes().getNamedItem("src").getTextContent());
|
||||
cleanerProperties.setOmitHtmlEnvelope(true);
|
||||
cleanerProperties.setOmitXmlDeclaration(true);
|
||||
String out = new SimpleXmlSerializer(cleanerProperties).getAsString(html);
|
||||
assertEquals(expected, out);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void removeInvalidTags() throws Exception{
|
||||
String html="<p><^-^></p>";
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
final Document doc = new DomSerializer(cleanerProperties, false).createDOM(tagNode);
|
||||
assertEquals("<^-^>", doc.getElementsByTagName("p").item(0).getChildNodes().item(0).getTextContent());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void removeInvalidTags2() throws Exception{
|
||||
String html="<p><1o/></p>";
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
final Document doc = new DomSerializer(cleanerProperties, false).createDOM(tagNode);
|
||||
assertEquals("<1o/>", doc.getElementsByTagName("p").item(0).getChildNodes().item(0).getTextContent());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void detectUnicodeSpaces() throws Exception{
|
||||
String html="<meta\u00A0property=\"test\" content=\"value\">";
|
||||
String expectedOutput= "test";
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
final Document doc = new DomSerializer(cleanerProperties, false).createDOM(tagNode);
|
||||
assertEquals(expectedOutput, doc.getElementsByTagName("meta").item(0).getAttributes().getNamedItem("property").getTextContent());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void preserveUnicodeTest() throws Exception
|
||||
{
|
||||
final String nonAsciiWord = "hemförsäkring";
|
||||
final String html = "<html>"
|
||||
+ "<body>"
|
||||
+ "<p>"
|
||||
+ nonAsciiWord
|
||||
+ "</p>"
|
||||
+ "</body>"
|
||||
+ "</html>";
|
||||
|
||||
final String expectedOutput =
|
||||
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n"
|
||||
+ "<html>\n" +
|
||||
" <head/>\n" +
|
||||
" <body>\n" +
|
||||
" <p>" + nonAsciiWord + "</p>\n" +
|
||||
" </body>\n" +
|
||||
"</html>\n"
|
||||
+ "";
|
||||
|
||||
final TagNode tagNode = new HtmlCleaner().clean(html);
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
final Document doc = new DomSerializer(cleanerProperties, false).createDOM(tagNode);
|
||||
assertEquals(expectedOutput, documentToString(doc));
|
||||
}
|
||||
|
||||
// See Bug #215
|
||||
@Test
|
||||
public void invalidXMLElementName() throws ParserConfigurationException{
|
||||
|
||||
final String HTML = "<img srcset=\"<p%20\">";
|
||||
|
||||
final CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
//
|
||||
// When we set allow to true, then we parse the attribute value as text
|
||||
//
|
||||
cleanerProperties.setAllowHtmlInsideAttributes(true);
|
||||
TagNode tagNode = new HtmlCleaner(cleanerProperties).clean(HTML);
|
||||
assertEquals(tagNode.getChildTags()[1].getChildTags()[0].getAttributeByName("srcset"),"<p%20");
|
||||
//
|
||||
// When we set allow to false, then we identify tags in attribute as new tags, and break
|
||||
// into a new tag
|
||||
//
|
||||
cleanerProperties.setAllowHtmlInsideAttributes(false);
|
||||
tagNode = new HtmlCleaner(cleanerProperties).clean(HTML);
|
||||
|
||||
//
|
||||
// Not an issue for HTML, which accepts pretty much anything in a tag name
|
||||
//
|
||||
cleanerProperties.setOmitXmlDeclaration(true);
|
||||
String output = new SimpleHtmlSerializer(cleanerProperties).getAsString(tagNode);
|
||||
assertEquals("<html><head></head><body><img srcset=\"\" /><p%20></p%20></body></html>", output);
|
||||
|
||||
//
|
||||
// But for XML DOM, we must follow the rules for building valid names, which means
|
||||
// getting rid of the % sign
|
||||
//
|
||||
final Document doc = new DomSerializer(cleanerProperties, false).createDOM(tagNode);
|
||||
assertEquals(1, doc.getDocumentElement().getElementsByTagName("p20").getLength());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void errorChecking() throws ParserConfigurationException{
|
||||
TagNode node = cleaner.clean("<p>");
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), true, true, false);
|
||||
Document document = ser.createDocument(node);
|
||||
assertFalse(document.getStrictErrorChecking());
|
||||
}
|
||||
|
||||
/**
|
||||
* See issue 108
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
@Ignore
|
||||
public void html5doctype() throws Exception{
|
||||
cleaner.getProperties().setUseCdataForScriptAndStyle(true);
|
||||
cleaner.getProperties().setOmitCdataOutsideScriptAndStyle(true);
|
||||
String initial = readFile("src/test/resources/test23.html");
|
||||
TagNode tagNode = cleaner.clean(initial);
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties());
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
assertNotNull(dom.getChildNodes().item(0).getChildNodes().item(0));
|
||||
assertEquals("head", dom.getChildNodes().item(0).getChildNodes().item(0).getNodeName());
|
||||
}
|
||||
|
||||
/**
|
||||
* See issue 127
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void rootNodeAttributes() throws Exception{
|
||||
cleaner.getProperties().setUseCdataForScriptAndStyle(true);
|
||||
cleaner.getProperties().setOmitCdataOutsideScriptAndStyle(true);
|
||||
String initial = readFile("src/test/resources/test29.html");
|
||||
TagNode tagNode = cleaner.clean(initial);
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties());
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
assertNotNull(dom.getChildNodes().item(0).getChildNodes().item(0));
|
||||
assertEquals("http://unknown.namespace.com", dom.getChildNodes().item(0).getAttributes().getNamedItem("xmlns").getNodeValue());
|
||||
assertEquals("27", dom.getChildNodes().item(0).getAttributes().getNamedItem("id").getNodeValue());
|
||||
//
|
||||
// Check we have a real ID attribute in the DOM and not just a regular attribute
|
||||
//
|
||||
assertEquals("http://unknown.namespace.com", dom.getElementById("27").getAttribute("xmlns"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cdata() throws Exception{
|
||||
cleaner.getProperties().setUseCdataForScriptAndStyle(true);
|
||||
cleaner.getProperties().setOmitCdataOutsideScriptAndStyle(true);
|
||||
String initial = "<script> this > that </script>";
|
||||
TagNode tagNode = cleaner.clean(initial);
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), cleaner.getProperties().isAdvancedXmlEscape(), true);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
DOMBuilder in = new DOMBuilder();
|
||||
org.jdom2.Document jdomDoc = in.build(dom);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String actual = outputter.outputString(jdomDoc);
|
||||
Assert.assertTrue(actual.contains("this > that"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cdata2() throws Exception{
|
||||
cleaner.getProperties().setUseCdataForScriptAndStyle(true);
|
||||
cleaner.getProperties().setOmitCdataOutsideScriptAndStyle(true);
|
||||
String initial = "<script> this > that </script>";
|
||||
TagNode tagNode = cleaner.clean(initial);
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), cleaner.getProperties().isAdvancedXmlEscape(), false);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
DOMBuilder in = new DOMBuilder();
|
||||
org.jdom2.Document jdomDoc = in.build(dom);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String actual = outputter.outputString(jdomDoc);
|
||||
Assert.assertTrue(actual.contains("this > that"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void escaping() throws Exception {
|
||||
cleaner.getProperties().setTranslateSpecialEntities(true);
|
||||
cleaner.getProperties().setAdvancedXmlEscape(true);
|
||||
TagNode tagNode = cleaner.clean("<div>£, £ and £</div>");
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), true);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
String actual = dom.getElementsByTagName("div").item(0).getTextContent();
|
||||
Assert.assertEquals(("£, £ and £"),actual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void escaping_2() throws Exception {
|
||||
cleaner.getProperties().setTranslateSpecialEntities(false);
|
||||
TagNode tagNode = cleaner.clean("<div>£, £ and £</div>");
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), false);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
String actual = dom.getElementsByTagName("div").item(0).getTextContent();
|
||||
Assert.assertEquals(("£, £ and £"),actual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void escaping_3() throws Exception {
|
||||
cleaner.getProperties().setTranslateSpecialEntities(false);
|
||||
TagNode tagNode = cleaner.clean("<div>£, £ and £</div>");
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), true);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
String actual = dom.getElementsByTagName("div").item(0).getTextContent();
|
||||
Assert.assertEquals(("£, £ and £"),actual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void escaping_4() throws Exception {
|
||||
cleaner.getProperties().setRecognizeUnicodeChars(false);
|
||||
TagNode tagNode = cleaner.clean("<div>£, £ and £</div>");
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), true);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
String actual = dom.getElementsByTagName("div").item(0).getTextContent();
|
||||
Assert.assertEquals(("£, £ and £"),actual);
|
||||
}
|
||||
|
||||
// Cleans a div containing the reserved characters quote, less-than,
// greater-than and ampersand with unicode-character recognition disabled,
// builds a DOM and compares the div's text content with an expected string.
//
// NOTE(review): the expected-value literal in the assertion below is not valid
// Java as displayed (an unescaped double quote closes the string early). It
// looks like an entity reference such as &quot; was decoded by the diff viewer;
// the same may apply to the other reserved characters in both literals.
// Restore the literals from the repository before relying on this test.
@Test
|
||||
public void escapingReservedCharactersTest() throws Exception {
|
||||
cleaner.getProperties().setRecognizeUnicodeChars(false);
|
||||
TagNode tagNode = cleaner.clean("<div>\" < > &</div>");
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), true);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
String actual = dom.getElementsByTagName("div").item(0).getTextContent();
|
||||
Assert.assertEquals(("" < > &"),actual);
|
||||
}
|
||||
|
||||
//
|
||||
// We shouldn't escape any characters in a comment
|
||||
//
|
||||
@Test
|
||||
public void escapingCommentsTest() throws Exception {
|
||||
cleaner.getProperties().setRecognizeUnicodeChars(false);
|
||||
TagNode tagNode = cleaner.clean("<div><!--\" \' < > &--></div>");
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), true);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
String actual = dom.getElementsByTagName("div").item(0).getChildNodes().item(0).getTextContent();
|
||||
Assert.assertEquals(("\" \' < > &"),actual);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void ncr() throws Exception {
|
||||
|
||||
cleaner.getProperties().setOmitComments(true);
|
||||
cleaner.getProperties().setNamespacesAware(false);
|
||||
cleaner.getProperties().setUseCdataForScriptAndStyle(true);
|
||||
cleaner.getProperties().setTranslateSpecialEntities(true);
|
||||
|
||||
TagNode tagNode = cleaner.clean("<div> ’ ж ý ÷ ÷ </div>");
|
||||
DomSerializer ser = new DomSerializer(cleaner.getProperties(), cleaner.getProperties().isAdvancedXmlEscape(), false);
|
||||
Document dom = ser.createDOM(tagNode);
|
||||
DOMBuilder in = new DOMBuilder();
|
||||
org.jdom2.Document jdomDoc = in.build(dom);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String actual = outputter.outputString(jdomDoc);
|
||||
|
||||
Assert.assertTrue(actual.contains("’ ж ý ÷ ÷"));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,83 @@
|
||||
/* Copyright (c) 2006-2014, the HtmlCleaner project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class EntityDeserializationTest extends TestCase {
|
||||
|
||||
private HtmlCleaner cleaner;
|
||||
|
||||
@Override
|
||||
public void setUp() {
|
||||
CleanerProperties cp = new CleanerProperties();
|
||||
cp.setDeserializeEntities(true);
|
||||
cleaner = new HtmlCleaner(cp);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() {
|
||||
cleaner = null;
|
||||
}
|
||||
|
||||
private void doTest(String input, String output) {
|
||||
assertEquals(
|
||||
output,
|
||||
cleaner.clean("<html><body>" + input + "</body></html>")
|
||||
.findElementByName("body", true)
|
||||
.getText()
|
||||
.toString()
|
||||
);
|
||||
}
|
||||
|
||||
public void testNamedEntity() {
|
||||
doTest(""", "\"");
|
||||
}
|
||||
|
||||
public void testDecimalEntity() {
|
||||
doTest(" ", "\u00a0");
|
||||
}
|
||||
|
||||
public void testHexadecimalEntity() {
|
||||
doTest(" ", "\u00a0");
|
||||
}
|
||||
|
||||
public void testAbortedEntity() {
|
||||
doTest("&"", "&\"");
|
||||
}
|
||||
|
||||
public void testCData() {
|
||||
doTest("<script>"+CData.BEGIN_CDATA + "&" + CData.END_CDATA+"</script>", "&");
|
||||
}
|
||||
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,236 @@
|
||||
/* Copyright (c) 2006-2013, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.jdom2.Document;
|
||||
import org.jdom2.Namespace;
|
||||
import org.jdom2.output.Format;
|
||||
import org.jdom2.output.XMLOutputter;
|
||||
import org.junit.Test;
|
||||
|
||||
public class JDomSerializerTest extends AbstractHtmlCleanerTest {
|
||||
|
||||
//
|
||||
// Test that we create valid element names
|
||||
//
|
||||
@Test
|
||||
public void elementNames() throws IOException{
|
||||
String initial = "<img srcset=\"<p%20\">";
|
||||
String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head /><body><img srcset=\"\" /><p20 /></body></html>\n";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setAddNewlineToHeadAndBody(false);
|
||||
TagNode tagNode = new HtmlCleaner(props).clean(initial);
|
||||
Document doc = new JDomSerializer(props, true).createJDom(tagNode);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String output = outputter.outputString(doc);
|
||||
assertEquals(expected, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we comment CDATA in JDom
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void safeCData1() throws IOException{
|
||||
String initial = "<head><script type=\"text/javascript\"><![CDATA[alert(\"Hello World\")]]></script></head>";
|
||||
String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setOmitCdataOutsideScriptAndStyle(true);
|
||||
props.setAddNewlineToHeadAndBody(false);
|
||||
TagNode tagNode = new HtmlCleaner(props).clean(initial);
|
||||
Document doc = new JDomSerializer(props, true).createJDom(tagNode);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String output = outputter.outputString(doc);
|
||||
assertEquals(expected, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we comment CDATA in JDom; in this case preserving existing comments
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void safeCData2() throws IOException{
|
||||
String initial = "<head><script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script></head>";
|
||||
String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setOmitCdataOutsideScriptAndStyle(true);
|
||||
props.setAddNewlineToHeadAndBody(false);
|
||||
TagNode tagNode = new HtmlCleaner(props).clean(initial);
|
||||
Document doc = new JDomSerializer(props, true).createJDom(tagNode);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String output = outputter.outputString(doc);
|
||||
assertEquals(expected, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we comment CDATA in JDom; in this case that we normalise comment style
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void safeCData3() throws IOException{
|
||||
String initial = "<head><script type=\"text/javascript\">/*<![CDATA[*/alert(\"Hello World\")\n/*]]>*/</script></head>";
|
||||
String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setOmitCdataOutsideScriptAndStyle(true);
|
||||
props.setAddNewlineToHeadAndBody(false);
|
||||
TagNode tagNode = new HtmlCleaner(props).clean(initial);
|
||||
Document doc = new JDomSerializer(props, true).createJDom(tagNode);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String output = outputter.outputString(doc);
|
||||
assertEquals(expected, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we comment CDATA in JDom; in this case a more complex example
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void safeCData4() throws IOException{
|
||||
String initial = readFile("src/test/resources/test33.html");
|
||||
String expected = readFile("src/test/resources/test33_expected.html");;
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setOmitCdataOutsideScriptAndStyle(true);
|
||||
props.setAddNewlineToHeadAndBody(false);
|
||||
TagNode tagNode = new HtmlCleaner(props).clean(initial);
|
||||
Document doc = new JDomSerializer(props, true).createJDom(tagNode);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String output = outputter.outputString(doc);
|
||||
assertEquals(expected, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we comment CDATA in JDom
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void safeCData5() throws IOException{
|
||||
String initial = "<head><script><></script></head>";
|
||||
String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script>/*<![CDATA[*/\n<>\n/*]]>*/</script></head><body /></html>\n";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setOmitCdataOutsideScriptAndStyle(true);
|
||||
props.setUseCdataForScriptAndStyle(true);
|
||||
props.setDeserializeEntities(true);
|
||||
props.setAddNewlineToHeadAndBody(false);
|
||||
TagNode tagNode = new HtmlCleaner(props).clean(initial);
|
||||
Document doc = new JDomSerializer(props, true).createJDom(tagNode);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String output = outputter.outputString(doc);
|
||||
assertEquals(expected, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we comment CDATA in JDom; this test uses CSS
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void safeCData6() throws IOException{
|
||||
String initial = "<head><style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style></head>";
|
||||
String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>\n";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setOmitCdataOutsideScriptAndStyle(true);
|
||||
props.setUseCdataForScriptAndStyle(true);
|
||||
props.setAddNewlineToHeadAndBody(false);
|
||||
TagNode tagNode = new HtmlCleaner(props).clean(initial);
|
||||
Document doc = new JDomSerializer(props, true).createJDom(tagNode);
|
||||
XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
|
||||
String output = outputter.outputString(doc);
|
||||
assertEquals(expected, output);
|
||||
}
|
||||
|
||||
/**
|
||||
* See issue #95
|
||||
*/
|
||||
@Test
|
||||
public void testNPE(){
|
||||
String validhtml5StringCode = "<html></html>";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setOmitHtmlEnvelope(true);
|
||||
TagNode tagNode = new HtmlCleaner(props).clean(validhtml5StringCode);
|
||||
new JDomSerializer(props, true).createJDom(tagNode);
|
||||
}
|
||||
|
||||
/**
|
||||
* See issue 106
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void CDATA() throws Exception{
|
||||
cleaner.getProperties().setUseCdataForScriptAndStyle(true);
|
||||
cleaner.getProperties().setOmitCdataOutsideScriptAndStyle(true);
|
||||
String initial = readFile("src/test/resources/test22.html");
|
||||
TagNode tagNode = cleaner.clean(initial);
|
||||
JDomSerializer ser = new JDomSerializer(cleaner.getProperties());
|
||||
Document doc = ser.createJDom(tagNode);
|
||||
assertEquals("org.jdom2.CDATA", doc.getRootElement().getChild("head").getChild("script").getContent().get(1).getClass().getName());
|
||||
}
|
||||
|
||||
/**
|
||||
* See issue 106
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void noCDATA() throws Exception{
|
||||
cleaner.getProperties().setUseCdataForScriptAndStyle(false);
|
||||
cleaner.getProperties().setOmitCdataOutsideScriptAndStyle(true);
|
||||
String initial = readFile("src/test/resources/test22.html");
|
||||
TagNode tagNode = cleaner.clean(initial);
|
||||
JDomSerializer ser = new JDomSerializer(cleaner.getProperties());
|
||||
Document doc = ser.createJDom(tagNode);
|
||||
assertEquals("org.jdom2.Text", doc.getRootElement().getChild("head").getChild("script").getContent().get(0).getClass().getName());
|
||||
}
|
||||
|
||||
/**
|
||||
* Test we handle foreign markup OK
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
public void namespaces() throws Exception{
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
String initial = readFile("src/test/resources/test21.html");
|
||||
TagNode tagNode = cleaner.clean(initial);
|
||||
JDomSerializer ser = new JDomSerializer(cleaner.getProperties());
|
||||
Document doc = ser.createJDom(tagNode);
|
||||
|
||||
//
|
||||
// These will fail with an NPE if the namespaces are not correct
|
||||
//
|
||||
doc.getRootElement().getChild("body", Namespace.getNamespace("http://www.w3.org/1999/xhtml")).getNamespaceURI();
|
||||
doc.getRootElement().getChild("body", Namespace.getNamespace("http://www.w3.org/1999/xhtml")).getChild("svg", Namespace.getNamespace("http://www.w3.org/2000/svg")).getNamespaceURI();
|
||||
doc.getRootElement().getChild("body", Namespace.getNamespace("http://www.w3.org/1999/xhtml")).getChild("svg", Namespace.getNamespace("http://www.w3.org/2000/svg")).getChild("title", Namespace.getNamespace("http://www.w3.org/2000/svg"));
|
||||
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
/* Copyright (c) 2006-2017, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class MathMLTest extends AbstractHtmlCleanerTest{
|
||||
|
||||
/**
|
||||
* Check that inline MathML statements remain inline. See bug #193
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void mathMLNamespaces() throws IOException{
|
||||
String input = readFile("src/test/resources/test35.html");
|
||||
String expected = readFile("src/test/resources/test35_expected.html");
|
||||
assertCleaned(input,expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check that MathML is properly formed. See bug #204
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void mathML() throws IOException{
|
||||
String input = readFile("src/test/resources/test36.html");
|
||||
String expected = readFile("src/test/resources/test36_expected.html");
|
||||
assertCleaned(input,expected);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,175 @@
|
||||
/* Copyright (c) 2006-2013, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class NamespacesTest extends AbstractHtmlCleanerTest{
|
||||
|
||||
|
||||
/**
|
||||
* Tests that we can add in the xlink NS declaration automatically if there is an xlink:href attribute with
|
||||
* no xmlns attribute.
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void missingDeclaration() throws IOException{
|
||||
String initial = "<p xlink:href=\"#someHeading\"/>";
|
||||
String expected = "<html xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n<head />\n<body><p xlink:href=\"#someHeading\"></p></body></html>";
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we can handle XMLNS="" attributes. See issue #135
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void xmlnsAttributeInUpperCase() throws IOException{
|
||||
String initial = "<BANANA XMLNS=\"BANANA\"/>";
|
||||
String expected = "<html>\n<head />\n<body><BANANA XMLNS=\"BANANA\" /></body></html>";
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
@Test
|
||||
public void xmlnsAttributeAndPrefix() throws IOException{
|
||||
String initial = "\n<head />\n<body><xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\"/>";
|
||||
String expected = "<html>\n<head />\n<body>\n<xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\" /></body></html>";
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
@Test
|
||||
public void xmlnsAttributeAndPrefix2() throws IOException{
|
||||
String initial = "<xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\"/>";
|
||||
String expected = "<html>\n<head />\n<body><xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\" /></body></html>";
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we can handle xmlns="" attributes. See issue #135
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void emptyNamespaces() throws IOException{
|
||||
String initial = readFile("src/test/resources/test32.html");
|
||||
String expected = "<html>\n<head />\n<body><a href=\"link.html\"><img /></a><p>Text</p></body></html>";
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* Uses an RDFa example to test that we retain namespace declarations. See issue #63
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void RDFa() throws IOException{
|
||||
String initial = readFile("src/test/resources/test13.html");
|
||||
String expected = readFile("src/test/resources/test13_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* Uses a namespace prefix for an element. See issue #63
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void DCElement() throws IOException{
|
||||
String initial = readFile("src/test/resources/test14.html");
|
||||
String expected = readFile("src/test/resources/test14_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* Uses a namespace prefix for an attribute. See issue #63
|
||||
* @throws IOException
|
||||
*/
|
||||
@Test
|
||||
public void DCAttribute() throws IOException{
|
||||
String initial = readFile("src/test/resources/test15.html");
|
||||
String expected = readFile("src/test/resources/test15_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* If we aren't NS aware, strip out the xmlns attr and process everything
|
||||
* as HTML.
|
||||
*/
|
||||
@Test
|
||||
public void testTableCellsWithoutNamespaceAwareness() throws IOException{
|
||||
cleaner.getProperties().setNamespacesAware(false);
|
||||
String initial = readFile("src/test/resources/test26.html");
|
||||
String expected = readFile("src/test/resources/test26_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* If we are namespace-aware and use the legacy HTML namespace, we should
|
||||
* treat the content as HTML. See issue #115
|
||||
*/
|
||||
@Test
|
||||
public void testTableCellsUsingNamespaceAwareAndLegacyHtmlNS() throws IOException{
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
cleaner.getProperties().setOmitUnknownTags(true);
|
||||
String initial = readFile("src/test/resources/test26.html");
|
||||
String expected = readFile("src/test/resources/test26_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* If we're NS-aware and using XHTML, treat the content as HTML tags and
|
||||
* insert TBODY into the table (etc) but retain the xmlns attr on the html
|
||||
* tag
|
||||
*/
|
||||
@Test
|
||||
public void testTableCellsUsingNamespaceAwareAndXhtmlNS() throws IOException{
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
cleaner.getProperties().setOmitUnknownTags(true);
|
||||
String initial = readFile("src/test/resources/test27.html");
|
||||
String expected = readFile("src/test/resources/test27_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/**
|
||||
* If we are namespace-aware and use an unknown namespace,
|
||||
* all the content will be treated as foreign markup; this means
|
||||
* there will be no insertion of TBODY tags as the table element
|
||||
* is not interpreted as being a HTML table element
|
||||
*/
|
||||
@Test
|
||||
public void testTableCellsUsingNamespaceAwareAndUnknownNS() throws IOException{
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
cleaner.getProperties().setOmitUnknownTags(true);
|
||||
String initial = readFile("src/test/resources/test28.html");
|
||||
String expected = readFile("src/test/resources/test28_expected.html");
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,34 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.junit.Test;
|
||||
|
||||
public class NestingTest extends TestCase {
|
||||
|
||||
public final static int TOO_DEEP_NESTING = 9999;
|
||||
public final static String TOO_DEEP_DOC = _nestedDoc(TOO_DEEP_NESTING, "<div>", "</div>", "");
|
||||
|
||||
public static String _nestedDoc(int nesting, String open, String close, String content) {
|
||||
StringBuilder sb = new StringBuilder(nesting * (open.length() + close.length()));
|
||||
for (int i = 0; i < nesting; ++i) {
|
||||
sb.append(open);
|
||||
if ((i & 31) == 0) {
|
||||
sb.append("\n");
|
||||
}
|
||||
}
|
||||
sb.append("\n").append(content).append("\n");
|
||||
for (int i = 0; i < nesting; ++i) {
|
||||
sb.append(close);
|
||||
if ((i & 31) == 0) {
|
||||
sb.append("\n");
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDeepNesting(){
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
TagNode root = cleaner.clean(TOO_DEEP_DOC);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,663 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Matcher;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Testing node manipulation after cleaning.
|
||||
* TODO String escaping tests should be moved to UtilsTest class [Eugene]
|
||||
* @author Eugene Sapozhnikov (blackorangebox@gmail.com)
|
||||
*/
|
||||
public class PropertiesTest extends TestCase {
|
||||
|
||||
/**
|
||||
* Test behavour of creating a new cleaner with properties including
|
||||
* tag provider set
|
||||
* @throws Exception
|
||||
*/
|
||||
public void initialiseCleanerWithProperties() throws Exception {
|
||||
CleanerProperties properties = new CleanerProperties();
|
||||
properties.setTagInfoProvider(Html5TagProvider.INSTANCE);
|
||||
HtmlCleaner cleaner = new HtmlCleaner(properties);
|
||||
assertTrue(cleaner.getTagInfoProvider() instanceof Html5TagProvider);
|
||||
|
||||
properties = new CleanerProperties();
|
||||
properties.setTagInfoProvider(null);
|
||||
cleaner = new HtmlCleaner(properties);
|
||||
assertTrue(cleaner.getTagInfoProvider() instanceof DefaultTagProvider);
|
||||
|
||||
properties = new CleanerProperties();
|
||||
properties.setTagInfoProvider(null);
|
||||
cleaner = new HtmlCleaner(null,properties);
|
||||
assertTrue(cleaner.getTagInfoProvider() instanceof DefaultTagProvider);
|
||||
|
||||
properties = new CleanerProperties();
|
||||
properties.setTagInfoProvider(null);
|
||||
cleaner = new HtmlCleaner(Html5TagProvider.INSTANCE, properties);
|
||||
assertTrue(cleaner.getTagInfoProvider() instanceof Html5TagProvider);
|
||||
|
||||
properties = new CleanerProperties();
|
||||
properties.setTagInfoProvider(DefaultTagProvider.INSTANCE);
|
||||
cleaner = new HtmlCleaner(Html5TagProvider.INSTANCE, properties);
|
||||
assertTrue(cleaner.getTagInfoProvider() instanceof Html5TagProvider);
|
||||
}
|
||||
|
||||
public void testPropertiesAdvancedXmlEscape() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
properties.setNamespacesAware(false);
|
||||
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div>&"'<></div>") >= 0);
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString, xmlString.indexOf("<div>&amp;&quot;&apos;&lt;&gt;</div>") >= 0);
|
||||
}
|
||||
|
||||
public void testUseCdataForScriptAndStyle() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
properties.setUseCdataForScriptAndStyle(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
String expected = "<script>" + CData.SAFE_BEGIN_CDATA + "\nvar x=y&&z;\n" + CData.SAFE_END_CDATA
|
||||
+ "</script>";
|
||||
assertTrue("looking for :\"" + expected + "\" in :\n" + xmlString, xmlString.indexOf(expected) >= 0);
|
||||
expected = "<style>" + CData.SAFE_BEGIN_CDATA + "\n.test{font-size:10;}\n" + CData.SAFE_END_CDATA
|
||||
+ "</style>";
|
||||
assertTrue("looking for :\"" + expected + "\" in :\n" + xmlString, xmlString.indexOf(expected) >= 0);
|
||||
properties.setUseCdataForScriptAndStyle(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<script>var x=y&&z;</script>") >= 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<style>.test{font-size:10;}</style>") >= 0);
|
||||
}
|
||||
|
||||
public void testTranslateSpecialEntities() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
properties.setTranslateSpecialEntities(true);
|
||||
String specialHtmlEntities = "<div>" + new String(new char[] { 244, 8240, 215, 376, 8364 }) + "</div>";
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf(specialHtmlEntities) >= 0);
|
||||
properties.setTranslateSpecialEntities(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf(specialHtmlEntities) < 0);
|
||||
}
|
||||
|
||||
public void testRecognizeUnicodeChars() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
String unicodeCharString = "<div>" + new String(new char[] { 352, 8224, 8249 }) + "</div>";
|
||||
properties.setRecognizeUnicodeChars(true);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf(unicodeCharString) >= 0);
|
||||
properties.setRecognizeUnicodeChars(false);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf(unicodeCharString) < 0);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<div>&#352;&#8224;&#8249;</div>") >= 0);
|
||||
}
|
||||
|
||||
public void testOmitUnknownTags() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
properties.setOmitUnknownTags(true);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<mytag>content of unknown tag</mytag>") < 0);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("content of unknown tag") >= 0);
|
||||
properties.setOmitUnknownTags(false);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<mytag>content of unknown tag</mytag>") >= 0);
|
||||
}
|
||||
|
||||
public void testTreatUnknownTagsAsContent() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setOmitUnknownTags(false);
|
||||
properties.setTreatUnknownTagsAsContent(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<mytag>content of unknown tag</mytag>") >= 0);
|
||||
properties.setTreatUnknownTagsAsContent(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<mytag>content of unknown tag</mytag>") >= 0);
|
||||
}
|
||||
|
||||
public void testNamespacesAware() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
properties.setNamespacesAware(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<html xmlns:my=\"my\">") >= 0);
|
||||
assertTrue(xmlString.indexOf("<my:tag id=\"xxx\">aaa</my:tag>") >= 0);
|
||||
properties.setNamespacesAware(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<html") >= 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<tag id=\"xxx\">aaa</tag>") >= 0);
|
||||
}
|
||||
|
||||
public void testOmitDeprecatedTags() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
properties.setOmitDeprecatedTags(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<u>content of deprecated tag</u>") < 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("content of deprecated tag") >= 0);
|
||||
properties.setOmitDeprecatedTags(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<u>content of deprecated tag</u>") >= 0);
|
||||
}
|
||||
|
||||
public void testTreatDeprecatedTagsAsContent() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
properties.setOmitDeprecatedTags(false);
|
||||
properties.setTreatDeprecatedTagsAsContent(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<u>content of deprecated tag</u>") >= 0);
|
||||
properties.setTreatDeprecatedTagsAsContent(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<u>content of deprecated tag</u>") >= 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testOmitComments() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setOmitComments(false);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<!--my comment-->") >= 0);
|
||||
properties.setOmitComments(true);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<!--my comment-->") < 0);
|
||||
}
|
||||
|
||||
public void testUseEmptyElementTags() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
// Tag <a> connot be collapsed according to DefaultTagProvider
|
||||
properties.setUseEmptyElementTags(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<a href=\"index.php\" />") < 0);
|
||||
assertTrue(xmlString.indexOf("<a href=\"index.php\"></a>") >= 0);
|
||||
|
||||
properties.setUseEmptyElementTags(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<a href=\"index.php\"></a>") >= 0);
|
||||
|
||||
properties.setUseEmptyElementTags(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<br />") >= 0);
|
||||
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
// jericho reports that td can not be empty. so we test on <tr/>
|
||||
// collapsing
|
||||
assertTrue(xmlString, xmlString.indexOf("<tr><td></td></tr><tr />") >= 0);
|
||||
properties.setUseEmptyElementTags(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<table><tbody><tr><td></td></tr><tr></tr></tbody></table>") >= 0);
|
||||
}
|
||||
|
||||
public void testAllowMultiWordAttributes() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
properties.setUseEmptyElementTags(false);
|
||||
properties.setAllowMultiWordAttributes(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div att=\"a b c\">") < 0);
|
||||
assertTrue(xmlString.indexOf("<div att=\"a\" b=\"b\" c=\"c\">") >= 0);
|
||||
properties.setAllowMultiWordAttributes(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div att=\"a b c\">") >= 0);
|
||||
|
||||
properties.setAllowHtmlInsideAttributes(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<a title=\"<b>Title<b> is here\">LINK 1</a>") >= 0);
|
||||
properties.setAllowHtmlInsideAttributes(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<a title=\"<b>Title<b> is here\">LINK 1</a>") < 0);
|
||||
assertTrue(xmlString.indexOf("<a title=\"\"><b>Title<b> is here">LINK 1</b></b></a>") >= 0);
|
||||
|
||||
properties.setIgnoreQuestAndExclam(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<!INSTRUCTION1 id="aaa">") < 0);
|
||||
assertTrue(xmlString.indexOf("<?INSTRUCTION2 id="bbb">") < 0);
|
||||
properties.setIgnoreQuestAndExclam(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<!INSTRUCTION1 id="aaa">") >= 0);
|
||||
assertTrue(xmlString.indexOf("<?INSTRUCTION2 id="bbb">") >= 0);
|
||||
|
||||
properties.setNamespacesAware(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<html xmlns:my=\"my\">") >= 0);
|
||||
assertTrue(xmlString.indexOf("<my:tag id=\"xxx\">aaa</my:tag>") >= 0);
|
||||
properties.setNamespacesAware(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<html") >= 0);
|
||||
assertTrue(xmlString.indexOf("<tag id=\"xxx\">aaa</tag>") >= 0);
|
||||
}
|
||||
public void testAllowHtmlInsideAttributes() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
properties.setAllowHtmlInsideAttributes(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue( xmlString.indexOf("<a title=\"<b>Title<b> is here\">LINK 1</a>") >= 0 );
|
||||
properties.setAllowHtmlInsideAttributes(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue( xmlString.indexOf("<a title=\"<b>Title<b> is here\">LINK 1</a>") < 0 );
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue( xmlString.indexOf("<a title=\"\"><b>Title<b> is here">LINK 1</b></b></a>") >= 0 );
|
||||
}
|
||||
public void testIgnoreQuestAndExclam() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
properties.setIgnoreQuestAndExclam(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue( xmlString.indexOf("<!INSTRUCTION1 id="aaa">") < 0 );
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue( xmlString.indexOf("<?INSTRUCTION2 id="bbb">") < 0 );
|
||||
properties.setIgnoreQuestAndExclam(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue( xmlString.indexOf("<!INSTRUCTION1 id="aaa">") >= 0 );
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue( xmlString.indexOf("<?INSTRUCTION2 id="bbb">") >= 0 );
|
||||
}
|
||||
/**
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testComments() throws IOException {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setOmitComments(false);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<!--my comment-->") >= 0);
|
||||
properties.setOmitComments(true);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<!--my comment-->") < 0);
|
||||
|
||||
properties.setOmitComments(false);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<!-- comment with == - hyphen -->") >= 0);
|
||||
properties.setHyphenReplacementInComment("*");
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<!-- comment with ** - hyphen -->") >= 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testOmitXmlDeclaration() throws IOException {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setOmitXmlDeclaration(false);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<?xml version=\"1.0\"") >= 0);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<?xml version=\"1.0\"") < 0);
|
||||
}
|
||||
|
||||
public void testOmitDoctypeDeclaration() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
properties.setAdvancedXmlEscape(false);
|
||||
|
||||
properties.setOmitDoctypeDeclaration(false);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf(
|
||||
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">") >= 0);
|
||||
properties.setOmitDoctypeDeclaration(true);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf(
|
||||
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">") < 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testOmitHtmlEnvelope() throws IOException {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
properties.setHtmlVersion(4);
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setAddNewlineToHeadAndBody(false);
|
||||
String xmlString;
|
||||
properties.setOmitHtmlEnvelope(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<html><head>") < 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("</body></html>") < 0);
|
||||
properties.setOmitHtmlEnvelope(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString, xmlString.indexOf("<html><head>") >= 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString, xmlString.indexOf("</body></html>") >= 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws IOException
|
||||
*/
|
||||
public void testOmitHtml5Envelope() throws IOException {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
properties.setHtmlVersion(5);
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setAddNewlineToHeadAndBody(false);
|
||||
String xmlString;
|
||||
properties.setOmitHtmlEnvelope(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<html><head>") < 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("</body></html>") < 0);
|
||||
properties.setOmitHtmlEnvelope(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString, xmlString.indexOf("<html><head><style>") >= 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString, xmlString.indexOf("</body></html>") >= 0);
|
||||
}
|
||||
|
||||
public void testPruneProperties() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
|
||||
properties.reset();
|
||||
properties.setPruneTags("div,mytag");
|
||||
String xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div") < 0);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<mytag") < 0);
|
||||
properties.setPruneTags("");
|
||||
properties.setAllowTags("html,body,div");
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div") >= 0);
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<mytag") < 0);
|
||||
}
|
||||
|
||||
public void testEmptyAttributesProperties() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
|
||||
properties.reset();
|
||||
String xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<input checked=\"checked\" />") >= 0);
|
||||
properties.setBooleanAttributeValues("empty");
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<input checked=\"\" />") >= 0);
|
||||
properties.setBooleanAttributeValues("true");
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<input checked=\"true\" />") >= 0);
|
||||
properties.setBooleanAttributeValues("selft");
|
||||
assertTrue(getXmlString(cleaner, properties).indexOf("<input checked=\"checked\" />") >= 0);
|
||||
}
|
||||
|
||||
private String getXmlString(HtmlCleaner cleaner, CleanerProperties properties) throws IOException {
|
||||
TagNode node = cleaner.clean(new File("src/test/resources/test4.html"), "UTF-8");
|
||||
String xmlString = new SimpleXmlSerializer(properties).getAsString(node);
|
||||
return xmlString;
|
||||
}
|
||||
|
||||
/**
 * Cleans a one-line document whose {@code <div>} holds several character entities and
 * serializes it twice with the same {@link SimpleXmlSerializer}: once as created, and
 * once after {@code setCreatingHtmlDom(true)}. Each trimmed result is compared with an
 * expected string that starts with the HTML 4.01 Transitional DOCTYPE.
 *
 * NOTE(review): the string literals in this method contain bare double-quote characters
 * and are therefore not valid Java as displayed; the entity escapes were presumably
 * decoded when this listing was rendered. Restore them from the original source file
 * before relying on this text.
 */
public void testNbsp() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
properties.setTranslateSpecialEntities(false);
|
||||
properties.setOmitDoctypeDeclaration(false);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
properties.setAdvancedXmlEscape(true);
|
||||
properties.setAddNewlineToHeadAndBody(false);
|
||||
|
||||
// test first when generating xml
|
||||
TagNode node = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
|
||||
+ "<div> &"''<> &garbage;&</div>");
|
||||
SimpleXmlSerializer simpleXmlSerializer = new SimpleXmlSerializer(properties);
|
||||
String xmlString = simpleXmlSerializer.getAsString(node, "UTF-8");
|
||||
assertEquals("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
|
||||
+ "<html><head /><body><div> &"''<>" + String.valueOf((char) 160)
|
||||
+ "&garbage;&</div></body></html>", xmlString.trim());
|
||||
|
||||
simpleXmlSerializer.setCreatingHtmlDom(true);
|
||||
// then test when generating html
|
||||
String domString = simpleXmlSerializer.getAsString(node, "UTF-8");
|
||||
assertEquals("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n" +
|
||||
// "<html><head /><body><div> &"''<> &garbage;&</div></body></html>",
|
||||
"<html><head /><body><div> &"''<> &garbage;&</div></body></html>",
|
||||
domString.trim());
|
||||
}
|
||||
|
||||
/**
|
||||
* Make sure that a hexadecimal character reference needs the leading 'x'.
|
||||
* <ul>
|
||||
* <li>ŠA; is converted by FF to 3 characters: Š + 'A' + ';'</li>
|
||||
* <li>�x138A; is converted by FF to 6? 7? characters: � 'x'+'1'+'3'+
|
||||
* '8' + 'A' + ';' #0 is displayed kind of weird</li>
|
||||
* <li>ᎊ is a single character</li>
|
||||
* </ul>
|
||||
*
* The test serializes two {@code <div>} fragments with the HTML envelope and the XML
* declaration omitted and compares each output with a string built from explicit
* {@code char} values (138, 'A', ';' and 0x138A respectively).
*
* NOTE(review): the character references in the examples above and in the input
* literals below appear already decoded in this listing; presumably the original
* source spells them as numeric entities - confirm against the original file.
*
* @throws Exception
|
||||
*/
|
||||
public void testHexConversion() throws Exception {
|
||||
CleanerProperties properties = new CleanerProperties();
|
||||
properties.setOmitHtmlEnvelope(true);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
SimpleXmlSerializer simpleXmlSerializer = new SimpleXmlSerializer(properties);
|
||||
simpleXmlSerializer.setCreatingHtmlDom(false);
|
||||
|
||||
String xmlString = simpleXmlSerializer.getAsString( "<div>ŠA;</div>");
|
||||
assertEquals("<div>"+new String(new char[] {138, 'A',';'})+"</div>", xmlString);
|
||||
xmlString = simpleXmlSerializer.getAsString( "<div>ᎊ</div>");
|
||||
assertEquals("<div>"+new String(new char[] {0x138A})+"</div>", xmlString);
|
||||
// Last statement of the test; no assertion follows this reset.
properties.reset();
|
||||
|
||||
}
|
||||
|
||||
public void testPattern() {
|
||||
for (Object[] test : new Object[][] {
|
||||
new Object[] { "0x138A;", false, -1, -1, null, true, 0, 7, "x138A", true, 0, 1, "0" },
|
||||
new Object[] { "x138A;", true, 0, 6, "x138A", true, 0, 6, "x138A", false, -1, -1, null },
|
||||
new Object[] { "138;", false, -1, -1, null, false, -1, -1, null, true, 0, 4, "138" },
|
||||
new Object[] { "139", false, -1, -1, null, false, -1, -1, null, true, 0, 3, "139" },
|
||||
new Object[] { "x13A", true, 0, 4, "x13A", true, 0, 4, "x13A", false, -1, -1, null },
|
||||
new Object[] { "13F", false, -1, -1, null, false, -1, -1, null, true, 0, 2, "13" },
|
||||
new Object[] { "13", false, -1, -1, null, false, -1, -1, null, true, 0, 2, "13" },
|
||||
new Object[] { "X13AZ", true, 0, 4, "X13A", true, 0, 4, "X13A", false, -1, -1, null } }) {
|
||||
int i = 0;
|
||||
String input = (String) test[i++];
|
||||
boolean strict = (Boolean) test[i++];
|
||||
int sstart = (Integer) test[i++];
|
||||
int send = (Integer) test[i++];
|
||||
String sgroup = (String) test[i++];
|
||||
boolean relaxed = (Boolean) test[i++];
|
||||
int rstart = (Integer) test[i++];
|
||||
int rend = (Integer) test[i++];
|
||||
String rgroup = (String) test[i++];
|
||||
boolean decimal = (Boolean) test[i++];
|
||||
int dstart = (Integer) test[i++];
|
||||
int dend = (Integer) test[i++];
|
||||
String dgroup = (String) test[i++];
|
||||
Matcher m = Utils.HEX_STRICT.matcher(input);
|
||||
boolean actual = m.find();
|
||||
assertEquals(input, strict, actual);
|
||||
if (actual) {
|
||||
assertEquals(input + " strict start ", sstart, m.start());
|
||||
assertEquals(input + " strict end ", send, m.end());
|
||||
assertEquals(input + " strict group ", sgroup, m.group(1));
|
||||
}
|
||||
m = Utils.HEX_RELAXED.matcher(input);
|
||||
actual = m.find();
|
||||
assertEquals(input, relaxed, actual);
|
||||
if (actual) {
|
||||
assertEquals(input + " relaxed start ", rstart, m.start());
|
||||
assertEquals(input + " relaxed end ", rend, m.end());
|
||||
assertEquals(input + " relaxed group ", rgroup, m.group(1));
|
||||
}
|
||||
m = Utils.DECIMAL.matcher(input);
|
||||
actual = m.find();
|
||||
assertEquals(input, decimal, actual);
|
||||
if (actual) {
|
||||
assertEquals(input + " decimal start ", dstart, m.start());
|
||||
assertEquals(input + " decimal end ", dend, m.end());
|
||||
assertEquals(input + " decimal group ", dgroup, m.group(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Serializes an {@code <h3>} fragment containing a right single quotation mark and an
 * unclosed {@code <div>}, with the HTML envelope and XML declaration omitted and empty
 * element tags disabled. The expected output keeps the quotation mark and contains the
 * closing {@code </div>} that the input leaves out.
 *
 * NOTE(review): the special characters in the two literals may have been entity
 * references in the original source - confirm against the original file.
 */
public void testConvertUnicode() throws Exception {
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitHtmlEnvelope(true);
|
||||
cleanerProperties.setOmitXmlDeclaration(true);
|
||||
cleanerProperties.setUseEmptyElementTags(false);
|
||||
// the right tick in the input is the Unicode character with decimal code 8217
|
||||
String output = new SimpleXmlSerializer(cleanerProperties).getAsString(
|
||||
"<h3><u><strong>President’s Message</strong></u><div> </h3>");
|
||||
assertEquals("<h3><u><strong>President’s Message</strong></u><div> </div></h3>", output);
|
||||
}
|
||||
|
||||
// Building blocks for the documents used by disabledTestConvertOldStyleComments().

// Document prefix up to and including the opening <script> tag.
private static final String HTML_COMMENT_OUT_BEGIN = "<html><head><script>";
|
||||
// Document suffix starting with the closing </script> tag.
private static final String HTML_COMMENT_OUT_END = "</script></head><body></body></html>";
|
||||
// Script body placed between the comment markers.
private static final String SAMPLE_JS = "var x = ['foo','bar'];";
|
||||
// HTML comment opener used by the old-style "comment out the script" hack.
private static final String COMMENT_START = "<!--";
|
||||
// HTML comment closer matching COMMENT_START.
private static final String COMMENT_END = "-->";
|
||||
|
||||
/**
|
||||
* Test conversion of former ( now bad practice ) of:
|
||||
*
|
||||
* <pre>
|
||||
* <style><!-- style info --></style>
|
||||
* </pre>
|
||||
*
|
||||
* into <style>/(star)<![CDATA[(star)/ style info
|
||||
* /(star)]]>(star)/</style>
|
||||
*
|
||||
* Note: disabled because it doesn't test actual behavior
|
||||
* @throws IOException
|
||||
*/
|
||||
public void disabledTestConvertOldStyleComments() throws IOException {
|
||||
// TODO: May need additional flag to handle '<' inside of scripts
|
||||
// dontEscape() in xml serializer should not be triggered based on use
|
||||
// cdata
|
||||
// but dontEscape is used by subclasses -- need to investigate best
|
||||
// solution.
|
||||
// maybe o.k. to have the < > be translated. That is what original test
|
||||
// does.
|
||||
// but the ' should probably not be touched??
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = new CleanerProperties();
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
properties.setUseCdataForScriptAndStyle(true);
|
||||
properties.setAddNewlineToHeadAndBody(false);
|
||||
// test for positive matches to old-style comment hacks
|
||||
for (String[] testData : new String[][] {
|
||||
// normal case - remove old-style comment out hack
|
||||
new String[] {
|
||||
HTML_COMMENT_OUT_BEGIN + "//" + COMMENT_START + "\n" + SAMPLE_JS + "//" + COMMENT_END + "\n"
|
||||
+ HTML_COMMENT_OUT_END,
|
||||
HTML_COMMENT_OUT_BEGIN + CData.SAFE_BEGIN_CDATA + "\n" + SAMPLE_JS
|
||||
+ CData.SAFE_END_CDATA + "\n" + HTML_COMMENT_OUT_END },
|
||||
// don't let random whitespace confuse things
|
||||
new String[] {
|
||||
HTML_COMMENT_OUT_BEGIN + "\n\n\n\n" + "//" + " \t" + COMMENT_START + "\n" + SAMPLE_JS
|
||||
+ "\n\n\n" + "//" + COMMENT_END + "\n\n\t\n" + HTML_COMMENT_OUT_END,
|
||||
HTML_COMMENT_OUT_BEGIN + "\n\n\n\n" + CData.SAFE_BEGIN_CDATA + "\n" + SAMPLE_JS
|
||||
+ "\n\n\n" + "//" + CData.SAFE_END_CDATA + "\n\n\t\n" + HTML_COMMENT_OUT_END },
|
||||
|
||||
}) {
|
||||
doTestConvertOldStyleComments(cleaner, properties, testData);
|
||||
}
|
||||
|
||||
// test for false positives
|
||||
for (String[] testData : new String[][] {
|
||||
// make sure not to remove real comments
|
||||
new String[] {
|
||||
HTML_COMMENT_OUT_BEGIN + "//" + "an ordinary comment" + "\n" + SAMPLE_JS + "//" + "a final remark"
|
||||
+ HTML_COMMENT_OUT_END,
|
||||
HTML_COMMENT_OUT_BEGIN + CData.SAFE_BEGIN_CDATA + "//" + "an ordinary comment" + "\n"
|
||||
+ SAMPLE_JS + "//" + "a final remark" + CData.SAFE_END_CDATA + HTML_COMMENT_OUT_END }, }) {
|
||||
doTestConvertOldStyleComments(cleaner, properties, testData);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param cleaner
|
||||
* @param properties
|
||||
* @param testData
|
||||
*/
|
||||
private void doTestConvertOldStyleComments(HtmlCleaner cleaner, CleanerProperties properties, String[] testData)
|
||||
throws IOException {
|
||||
TagNode node = cleaner.clean(testData[0]);
|
||||
// test to make sure the no-op still works
|
||||
properties.setUseCdataForScriptAndStyle(false);
|
||||
String xmlString = new SimpleXmlSerializer(properties).getAsString(node);
|
||||
assertEquals(testData[0], xmlString);
|
||||
|
||||
// now test actual
|
||||
properties.setUseCdataForScriptAndStyle(true);
|
||||
xmlString = new SimpleXmlSerializer(properties).getAsString(node);
|
||||
assertEquals(testData[1], xmlString);
|
||||
}
|
||||
|
||||
public void testIgnoreClosingCData() throws IOException {
|
||||
String html = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
|
||||
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta http-equiv=\"content-type\" content=\"application/xhtml+xml; charset=utf-8\" /><link href=\"aswa.css\" type=\"text/css\" rel=\"stylesheet\" /><title>ASWA - Events</title>"
|
||||
+ "<style type=\"text/css\">/*<![CDATA[*/\r\n"
|
||||
+ "#ampmep_188 { }\r\n"
|
||||
+ "/*]]>*/</style></head><body></body></html>";
|
||||
|
||||
CleanerProperties properties = new CleanerProperties();
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
properties.setUseCdataForScriptAndStyle(true);
|
||||
properties.setAddNewlineToHeadAndBody(false);
|
||||
properties.setIgnoreQuestAndExclam(false);
|
||||
HtmlCleaner cleaner = new HtmlCleaner(properties);
|
||||
TagNode node = cleaner.clean(html);
|
||||
//properties.setUseCdataForScriptAndStyle(false);
|
||||
String xmlString = new SimpleXmlSerializer(properties).getAsString(node);
|
||||
assertEquals(html, xmlString);
|
||||
}
|
||||
|
||||
/**
 * Serializes the test page with advanced XML escaping enabled, first with
 * {@code transResCharsToNCR} set to {@code true} and then to {@code false}, and looks
 * for two {@code <div>} elements (prefixed "1." and "2.") in each output.
 *
 * NOTE(review): the four expected literals contain bare double-quote characters and
 * are therefore not valid Java as displayed; the entity escapes were presumably
 * decoded when this listing was rendered, which also makes the expectations for the
 * two settings look identical. Restore them from the original source file.
 */
public void testTransResCharsToNCR() throws Exception {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
String xmlString;
|
||||
|
||||
properties.setNamespacesAware(false);
|
||||
properties.setAdvancedXmlEscape(true);
|
||||
properties.setTransResCharsToNCR(true);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div>1.&"'<></div>") >= 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div>2.&"'<></div>") >= 0);
|
||||
properties.setTransResCharsToNCR(false);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div>1.&"'<></div>") >= 0);
|
||||
xmlString = getXmlString(cleaner, properties);
|
||||
assertTrue(xmlString.indexOf("<div>2.&"'<></div>") >= 0);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
/**
|
||||
* Simple test to check that randomly appeared end tags are dropped out.
|
||||
*/
|
||||
public class RandomCloseTagTest extends TestCase {
|
||||
|
||||
public void testRandomCloseTagsRemoved() throws IOException{
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
CleanerProperties properties = cleaner.getProperties();
|
||||
properties.setOmitHtmlEnvelope(true);
|
||||
properties.setOmitXmlDeclaration(true);
|
||||
SimpleXmlSerializer serializer = new SimpleXmlSerializer(properties);
|
||||
TagNode cleaned = cleaner.clean("Some</span> text </b></div>");
|
||||
assertEquals("Some text ", serializer.getAsString(cleaned));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class RandomPageTest extends TestCase {
|
||||
|
||||
public void testPage() throws IOException {
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
cleaner.clean( new File("src/test/resources/gg_prob.html") );
|
||||
}
|
||||
|
||||
public void testHtml() throws IOException{
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
cleaner.clean( new File("src/test/resources/gg_prob_cleaned.html") );
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,223 @@
|
||||
/* Copyright (c) 2006-2013, the HtmlCleaner Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use of this software in source and binary forms,
|
||||
with or without modification, are permitted provided that the following
|
||||
conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other
|
||||
materials provided with the distribution.
|
||||
|
||||
* The name of HtmlCleaner may not be used to endorse or promote
|
||||
products derived from this software without specific prior
|
||||
written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package org.htmlcleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
|
||||
public class SVGTest extends AbstractHtmlCleanerTest{
|
||||
|
||||
@Test
|
||||
public void svgTreatedAsPhrasing() throws IOException
|
||||
{
|
||||
CleanerProperties cleanerProperties = new CleanerProperties();
|
||||
cleanerProperties.setOmitXmlDeclaration(false);
|
||||
cleanerProperties.setOmitDoctypeDeclaration(false);
|
||||
cleanerProperties.setIgnoreQuestAndExclam(false);
|
||||
cleanerProperties.setAddNewlineToHeadAndBody(false);
|
||||
cleanerProperties.setUseCdataFor("script,style,altscript");
|
||||
this.cleaner = new HtmlCleaner(cleanerProperties);
|
||||
this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
|
||||
|
||||
assertHTML(
|
||||
"<p><svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\"><circle cx=\"100\" cy=\"50\" fill=\"red\" r=\"40\" stroke=\"black\" stroke-width=\"2\" /></svg></p>",
|
||||
"<p><svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\"><circle cx=\"100\" cy=\"50\" fill=\"red\" r=\"40\" stroke=\"black\" stroke-width=\"2\"></circle></svg></p>"
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void nestedSVG()
|
||||
{
|
||||
String html = "<!DOCTYPE html>\n"
|
||||
+ "<html lang=\"en\">\n"
|
||||
+ "<head>\n"
|
||||
+ "</head>\n"
|
||||
+ "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
|
||||
+ "<svg xmlns=\"http://www.w3.org/2000/\">\n"
|
||||
+ " <svg></svg>\n"
|
||||
+ "</svg>\n"
|
||||
+ "</body>\n"
|
||||
+ "</html>";
|
||||
new HtmlCleaner().clean(html);
|
||||
|
||||
html = "<!DOCTYPE html>\n"
|
||||
+ "<html lang=\"en\">\n"
|
||||
+ "<head>\n"
|
||||
+ "</head>\n"
|
||||
+ "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
|
||||
+ "<svg xmlns=\"http://www.w3.org/2000/svg\">\n"
|
||||
+ " <circle cx=\"50\" cy=\"50\" r=\"40\" stroke=\"black\" stroke-width=\"3\" fill=\"red\" />\n"
|
||||
+ "</svg>\n"
|
||||
+ "</body>\n"
|
||||
+ "</html>";
|
||||
new HtmlCleaner().clean(html);
|
||||
|
||||
html = "<!DOCTYPE html>\n"
|
||||
+ "<html lang=\"en\">\n"
|
||||
+ "<head>\n"
|
||||
+ "</head>\n"
|
||||
+ "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
|
||||
+ "<svg xmlns=\"http://www.w3.org/2000/svg\">\n"
|
||||
+ " <svg></svg>\n"
|
||||
+ "</svg>\n"
|
||||
+ "</body>\n"
|
||||
+ "</html>";
|
||||
new HtmlCleaner().clean(html);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void svgCloseAssumedNS4() throws Exception{
|
||||
String html="<html><head></head><body><svg><h3>Title</h3><div>text</div></body></html>";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setNamespacesAware(true);
|
||||
props.setOmitXmlDeclaration(true);
|
||||
HtmlCleaner cleaner = new HtmlCleaner(props);
|
||||
String cleaned = new SimpleHtmlSerializer(cleaner.getProperties(), false).getAsString(cleaner.clean(html));
|
||||
assertEquals("<html><head></head><body><svg></svg><h3>Title</h3><div>text</div></body></html>", cleaned);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore // This is a tricky one as "a" is allowed in SVG, so the rest is assumed to be OK.
|
||||
public void svgCloseAssumedNS3() throws Exception{
|
||||
String html="<html><head></head><body><svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setNamespacesAware(true);
|
||||
props.setOmitXmlDeclaration(true);
|
||||
HtmlCleaner cleaner = new HtmlCleaner(props);
|
||||
String cleaned = new SimpleHtmlSerializer(cleaner.getProperties(), false).getAsString(cleaner.clean(html));
|
||||
assertEquals("<html><head></head><body><svg></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>", cleaned);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void svgCloseAssumedNS2() throws Exception{
|
||||
String html="<html><head></head><body><svg><title></title></svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setNamespacesAware(true);
|
||||
props.setOmitXmlDeclaration(true);
|
||||
HtmlCleaner cleaner = new HtmlCleaner(props);
|
||||
String cleaned = new SimpleHtmlSerializer(cleaner.getProperties(), false).getAsString(cleaner.clean(html));
|
||||
assertEquals("<html><head></head><body><svg><title></title></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>", cleaned);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void svgCloseAssumedNS() throws Exception{
|
||||
String html="<html><head></head><body><svg></svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
|
||||
CleanerProperties props = new CleanerProperties();
|
||||
props.setNamespacesAware(true);
|
||||
props.setOmitXmlDeclaration(true);
|
||||
HtmlCleaner cleaner = new HtmlCleaner(props);
|
||||
String cleaned = new SimpleHtmlSerializer(cleaner.getProperties(), false).getAsString(cleaner.clean(html));
|
||||
assertEquals("<html><head></head><body><svg></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>", cleaned);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void missingSVGNamespace() throws IOException {
|
||||
String initial = "<html><head><title>Title of document</title></head><body><svg><title>A big circle.</title></svg></body></html>";
|
||||
String expected = "<html>\n<head><title>Title of document</title></head>\n<body><svg><title>A big circle.</title></svg></body></html>";
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void preserveSVGtags() throws IOException{
|
||||
|
||||
cleaner.getProperties().setOmitXmlDeclaration(false);
|
||||
cleaner.getProperties().setOmitDoctypeDeclaration(false);
|
||||
cleaner.getProperties().setOmitUnknownTags(true);
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
|
||||
String initial = readFile("src/test/resources/test18.html");
|
||||
String expected = readFile("src/test/resources/test18_expected.html");
|
||||
|
||||
assertCleaned(initial,expected);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void preserveSVGtags2() throws IOException{
|
||||
|
||||
cleaner.getProperties().setOmitXmlDeclaration(false);
|
||||
cleaner.getProperties().setOmitDoctypeDeclaration(false);
|
||||
cleaner.getProperties().setOmitUnknownTags(true);
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
|
||||
String initial = readFile("src/test/resources/test19.html");
|
||||
String expected = readFile("src/test/resources/test19_expected.html");
|
||||
assertCleaned(initial,expected);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void preserveSVGtags3() throws IOException{
|
||||
|
||||
cleaner.getProperties().setOmitXmlDeclaration(false);
|
||||
cleaner.getProperties().setOmitDoctypeDeclaration(false);
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
|
||||
String initial = readFile("src/test/resources/test20.html");
|
||||
String expected = readFile("src/test/resources/test20_expected.html");
|
||||
|
||||
assertCleaned(initial,expected);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void preserveSVGtagsWithTitle() throws IOException{
|
||||
|
||||
cleaner.getProperties().setOmitXmlDeclaration(false);
|
||||
cleaner.getProperties().setOmitDoctypeDeclaration(false);
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
cleaner.getProperties().setOmitUnknownTags(true);
|
||||
|
||||
String initial = readFile("src/test/resources/test21.html");
|
||||
String expected = readFile("src/test/resources/test21_expected.html");
|
||||
|
||||
assertCleaned(initial,expected);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void preserveSVGstylesInPlace() throws IOException{
|
||||
|
||||
cleaner.getProperties().setOmitXmlDeclaration(false);
|
||||
cleaner.getProperties().setOmitDoctypeDeclaration(false);
|
||||
cleaner.getProperties().setNamespacesAware(true);
|
||||
cleaner.getProperties().setOmitUnknownTags(true);
|
||||
|
||||
String initial = readFile("src/test/resources/test25.html");
|
||||
String expected = readFile("src/test/resources/test25_expected.html");
|
||||
|
||||
assertCleaned(initial,expected);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,77 @@
|
||||
package org.htmlcleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Tests for some common use of <script> tags within <head> elements
|
||||
* @author scottw
|
||||
*
|
||||
*/
|
||||
public class ScriptTest extends AbstractHtmlCleanerTest {
|
||||
|
||||
@Test
|
||||
public void another() throws IOException{
|
||||
HtmlCleaner htmlCleaner = new HtmlCleaner();
|
||||
CleanerProperties props = htmlCleaner.getProperties();
|
||||
props.setAllowHtmlInsideAttributes(true);
|
||||
props.setAllowMultiWordAttributes(true);
|
||||
props.setRecognizeUnicodeChars(true);
|
||||
props.setOmitComments(true);
|
||||
TagNode rootNode = htmlCleaner.clean(new File("src/test/resources/script_test.html"));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void getScripts() throws IOException{
|
||||
HtmlCleaner cleaner = new HtmlCleaner();
|
||||
TagNode html = cleaner.clean( new File("src/test/resources/script_test.html") );
|
||||
TagNode head = html.findElementByName("head", false);
|
||||
|
||||
ArrayList<TagNode> scripts = new ArrayList<TagNode>();
|
||||
List<TagNode> children = head.getChildTagList();
|
||||
|
||||
for(TagNode child : children){
|
||||
if(child.getName().equals("script")){
|
||||
scripts.add(child);
|
||||
}
|
||||
}
|
||||
assertEquals(3, scripts.size());
|
||||
assertEquals("x.js", scripts.get(0).getAttributeByName("src"));
|
||||
assertEquals("y.js", scripts.get(1).getAttributeByName("src"));
|
||||
assertEquals("z.js", scripts.get(2).getAttributeByName("src"));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void scriptAttribute() throws IOException{
|
||||
cleaner.getProperties().setUseCdataForScriptAndStyle(true);
|
||||
String initial = "<button onclick='aaa(\"bbb\")'>Click here!</button>";
|
||||
String expected ="<html>\n<head />\n<body><button onclick=\"aaa("bbb")\">Click here!</button></body></html>";
|
||||
assertCleaned(initial, expected);
|
||||
}
|
||||
|
||||
/*
|
||||
* Test for issue #88 - thanks to Serge Dyomin
|
||||
*/
|
||||
@Test
|
||||
public void scriptAttributeQuotes() throws IOException{
|
||||
HtmlCleaner thecleaner=new HtmlCleaner();
|
||||
CleanerProperties props = thecleaner.getProperties();
|
||||
props.setOmitXmlDeclaration(true);
|
||||
props.setOmitComments(false);
|
||||
props.setTranslateSpecialEntities(true);
|
||||
|
||||
String initial = readFile("src/test/resources/test16.html");
|
||||
String expected = readFile("src/test/resources/test16_expected.html");
|
||||
String output = new SimpleHtmlSerializer(thecleaner.getProperties()).getAsString(thecleaner.clean(initial));
|
||||
|
||||
assertEquals(expected,output);
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user