Merge pull request 'merge version 1.' (#1) from master into main

Reviewed-on: https://src.isharkfly.com/honeymoose/HtmlCleaner/pulls/1
This commit is contained in:
2025-04-24 15:36:02 +00:00
182 changed files with 36298 additions and 1 deletions
+12
View File
@@ -0,0 +1,12 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Environment-dependent path to Maven home directory
/mavenHomeManager.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Zeppelin ignored files
/ZeppelinRemoteNotebooks/
+16
View File
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CheckStyle-IDEA" serialisationVersion="2">
<checkstyleVersion>10.23.0</checkstyleVersion>
<scanScope>JavaOnly</scanScope>
<copyLibs>true</copyLibs>
<option name="thirdPartyClasspath" />
<option name="activeLocationIds" />
<option name="locations">
<list>
<ConfigurationLocation id="bundled-sun-checks" type="BUNDLED" scope="All" description="Sun Checks">(bundled)</ConfigurationLocation>
<ConfigurationLocation id="bundled-google-checks" type="BUNDLED" scope="All" description="Google Checks">(bundled)</ConfigurationLocation>
</list>
</option>
</component>
</project>
+13
View File
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="htmlcleaner" />
</profile>
</annotationProcessing>
</component>
</project>
+7
View File
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>
+35
View File
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="sonatype-nexus-snapshots" />
<option name="name" value="Sonatype Nexus Snapshots" />
<option name="url" value="https://oss.sonatype.org/content/repositories/snapshots" />
</remote-repository>
<remote-repository>
<option name="id" value="ossez-repo-releases" />
<option name="name" value="iSharkFly Private Releases" />
<option name="url" value="https://repo.isharkfly.com/repository/isharkfly-maven-releases/" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Central Repository" />
<option name="url" value="https://repo.isharkfly.com/repository/maven/" />
</remote-repository>
<remote-repository>
<option name="id" value="ossez-repo-snapshots" />
<option name="name" value="iSharkFly Private Snapshots" />
<option name="url" value="https://repo.isharkfly.com/repository/isharkfly-maven-snapshots/" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
</component>
</project>
Generated
+6
View File
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>
+1 -1
View File
@@ -1,6 +1,6 @@
HtmlCleaner is a project originally developed by Vladimir Nikic (http://htmlcleaner.sourceforge.net/).
This version is modified by Zheng Sun.
This version is modified by iSharkFly.
Briefly speaking, the modifications are
@@ -0,0 +1,44 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
Additional work by Amplafi. -- All rights released.
*/
package org.htmlcleaner;
/**
 * A rule that can be applied to an attribute: a condition on the attribute's
 * name and value, plus a template string supplied by the rule.
 */
public interface AttributeTransformation {

    /**
     * Checks whether this transformation applies to the given attribute.
     *
     * @param attName  name of the attribute being examined
     * @param attValue value of the attribute being examined
     * @return {@code true} if the attribute meets this transformation's condition
     */
    boolean satisfy(String attName, String attValue);

    /**
     * @return the template string carried by this transformation
     */
    String getTemplate();
}
@@ -0,0 +1,72 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
Additional work by Amplafi. -- All rights released.
*/
package org.htmlcleaner;
import java.util.regex.Pattern;
/**
 * {@link AttributeTransformation} whose condition is given by two optional
 * regular expressions, one for the attribute name and one for the attribute
 * value. A {@code null} pattern places no constraint on its part.
 */
public class AttributeTransformationPatternImpl implements AttributeTransformation {
    private final Pattern attNamePattern;
    private final Pattern attValuePattern;
    private final String template;

    /**
     * @param attNamePattern  pattern searched for in the attribute name, or {@code null} for no constraint
     * @param attValuePattern pattern searched for in the attribute value, or {@code null} for no constraint
     * @param template        template returned by {@link #getTemplate()}
     */
    public AttributeTransformationPatternImpl(Pattern attNamePattern, Pattern attValuePattern, String template) {
        this.attNamePattern = attNamePattern;
        this.attValuePattern = attValuePattern;
        this.template = template;
    }

    /**
     * Same as the {@link Pattern}-based constructor, but compiles the given
     * regular expressions first. {@code null} expressions stay {@code null}.
     *
     * @param attNamePattern  regular expression for the attribute name, or {@code null}
     * @param attValuePattern regular expression for the attribute value, or {@code null}
     * @param template        template returned by {@link #getTemplate()}
     */
    public AttributeTransformationPatternImpl(String attNamePattern, String attValuePattern, String template) {
        this(compileOrNull(attNamePattern), compileOrNull(attValuePattern), template);
    }

    /** Compiles the expression, passing {@code null} through unchanged. */
    private static Pattern compileOrNull(String regex) {
        if (regex == null) {
            return null;
        }
        return Pattern.compile(regex);
    }

    /**
     * An attribute satisfies this transformation when every non-null pattern
     * is found (via {@code Matcher.find()}) in its respective part.
     *
     * @param attName  attribute name, tested against the name pattern
     * @param attValue attribute value, tested against the value pattern
     * @return {@code true} if both the name and the value condition hold
     */
    public boolean satisfy(String attName, String attValue) {
        boolean nameAccepted = attNamePattern == null || attNamePattern.matcher(attName).find();
        if (!nameAccepted) {
            return false;
        }
        // The value is only inspected once the name has been accepted.
        return attValuePattern == null || attValuePattern.matcher(attValue).find();
    }

    /**
     * @return the template
     */
    public String getTemplate() {
        return template;
    }
}
@@ -0,0 +1,38 @@
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
/**
 * Basic {@link HtmlNode} that keeps a reference to its parent tag.
 */
public class BaseHtmlNode extends BaseTokenImpl implements HtmlNode {

    protected TagNode parent;

    /**
     * Lists the children of this node's parent, this node included.
     *
     * @return the parent's children, or a new empty list when this node has no parent
     */
    public List<? extends BaseToken> getSiblings() {
        if (this.parent != null) {
            // All children of the parent, including this node.
            return this.parent.getAllChildren();
        }
        // No parent (root node): there are no siblings to report.
        return new ArrayList<BaseToken>();
    }

    /**
     * @return the parent tag, or {@code null} if none has been set
     */
    public TagNode getParent() {
        return this.parent;
    }

    /**
     * @param parent the tag to record as this node's parent
     */
    public void setParent(TagNode parent) {
        this.parent = parent;
    }

    /**
     * Writes nothing: this base implementation has an empty body.
     */
    public void serialize(Serializer serializer, Writer writer) throws IOException {
        // TODO Auto-generated method stub
    }
}
@@ -0,0 +1,72 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
/**
 * <p>
 * Base token interface. Tokens are the individual entities recognized by the
 * HTML parser; each one can be serialized and records the row and column at
 * which it was found in the source HTML.
 * </p>
 */
public interface BaseToken {

    /**
     * Serializes this token.
     *
     * @param serializer serializer in use
     * @param writer     destination of the output
     * @throws IOException if writing fails
     */
    void serialize(Serializer serializer, Writer writer) throws IOException;

    /**
     * @return row in source html where the token was found
     */
    int getRow();

    /**
     * @param row row in source html to record for this token
     */
    void setRow(int row);

    /**
     * @return col in source html where the token was found
     */
    int getCol();

    /**
     * @param col col in source html to record for this token
     */
    void setCol(int col);
}
@@ -0,0 +1,40 @@
package org.htmlcleaner;
/**
 * Base class for all tokens. Stores the row and column of the token so that
 * its position can be tracked.
 *
 * @author Konstantin Burov (aectann@gmail.com)
 *
 */
public abstract class BaseTokenImpl implements BaseToken {

    private int row;
    private int col;

    /** Creates a token whose row and column keep their default value of 0. */
    protected BaseTokenImpl() {
    }

    /**
     * @param row row to record for the token
     * @param col column to record for the token
     */
    protected BaseTokenImpl(int row, int col) {
        this.row = row;
        this.col = col;
    }

    public int getRow() {
        return this.row;
    }

    public void setRow(int row) {
        this.row = row;
    }

    public int getCol() {
        return this.col;
    }

    public void setCol(int col) {
        this.col = col;
    }

    /**
     * @return the position formatted as {@code (line=ROW, col=COL)}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("(line=");
        text.append(getRow());
        text.append(", col=");
        text.append(getCol());
        text.append(")");
        return text.toString();
    }
}
@@ -0,0 +1,74 @@
/*
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
 * Enumeration of HEAD, BODY and HEAD_AND_BODY, each carrying a short string
 * code.
 *
 * @author patmoore
 *
 */
public enum BelongsTo {
    HEAD_AND_BODY("all"),
    HEAD("head"),
    BODY("body");

    private final String dbCode;

    BelongsTo(String dbCode) {
        this.dbCode = dbCode;
    }

    /**
     * @return the dbCode
     */
    public String getDbCode() {
        return dbCode;
    }

    /**
     * Converts an arbitrary value to a constant of this enum.
     * <p>
     * A {@code BelongsTo} is returned unchanged. Any other non-null value is
     * converted with {@code toString()}, trimmed, and compared ignoring case
     * against each constant's code and name, in declaration order.
     * </p>
     *
     * @param value the value to convert, may be {@code null}
     * @return the matching constant, or {@code null} if {@code value} is
     *         {@code null} or matches nothing
     */
    public static BelongsTo toValue(Object value) {
        if (value instanceof BelongsTo) {
            return (BelongsTo) value;
        }
        if (value == null) {
            return null;
        }
        final String wanted = value.toString().trim();
        for (BelongsTo candidate : values()) {
            boolean codeMatches = candidate.getDbCode().equalsIgnoreCase(wanted);
            if (codeMatches || candidate.name().equalsIgnoreCase(wanted)) {
                return candidate;
            }
        }
        return null;
    }
}
@@ -0,0 +1,152 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.StringTokenizer;
/**
* <p>
* Browser compact XML serializer - creates resulting XML by stripping whitespaces wherever possible,
* but preserving single whitespace where at least one exists. This behaviour is well suited
* for web-browsers, which usually treat multiple whitespaces as single one, but make difference
* between single whitespace and empty text.
* </p>
*/
public class BrowserCompactXmlSerializer extends XmlSerializer {
// Tag name whose text children are written verbatim by serialize().
private static final String PRE_TAG = "pre";
// Markup written when a line break directly follows another line break in text content.
private static final String BR_TAG = "<br />";
// Delimiter used to split text content into tokens.
private static final String LINE_BREAK = "\n";
/**
 * @param props cleaner properties, handed to the superclass constructor
 */
public BrowserCompactXmlSerializer(CleanerProperties props) {
super(props);
}
/**
 * Writes the open tag of the given node and, unless the node uses minimized
 * tag syntax, its children followed by the end tag.
 * <p>
 * Text children outside a "pre" tag are escaped, trimmed and written with
 * compacted whitespace. Text children inside "pre" are written unchanged,
 * comments are written trimmed, and any other child serializes itself.
 * </p>
 *
 * @param tagNode node to serialize
 * @param writer destination of the output
 * @throws IOException if writing fails
 */
@Override
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
serializeOpenTag(tagNode, writer, false);
TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
// tagName stays null when the provider has no info for this tag.
String tagName = tagInfo!=null? tagInfo.getName() : null;
// Iterate over a copy of the child list; whitespace-only text nodes are removed from this copy below.
List<? extends BaseToken> tagChildren = new ArrayList<BaseToken>(tagNode.getAllChildren());
if (!isMinimizedTagSyntax(tagNode)) {
ListIterator<? extends BaseToken> childrenIt = tagChildren.listIterator();
while (childrenIt.hasNext()) {
Object item = childrenIt.next();
if (item != null) {
if (item instanceof ContentNode && !PRE_TAG.equals(tagName)) {
String content = ((ContentNode) item).getContent();
// When dontEscape() applies, only a literal "]]>" is neutralized; otherwise the text is XML-escaped.
content = dontEscape(tagNode) ? content.replaceAll("]]>", "]]&gt;") : escapeXml(content);
// Replace a leading and a trailing run of SpecialEntities.NON_BREAKABLE_SPACE with one plain space each.
content = content.replaceAll("^"+SpecialEntities.NON_BREAKABLE_SPACE+"+", " ");
content = content.replaceAll(SpecialEntities.NON_BREAKABLE_SPACE+"+$", " ");
boolean whitespaceAllowed = tagInfo != null && tagInfo.getDisplay().isLeadingAndEndWhitespacesAllowed();
// Remember whether the text began/ended with whitespace before trim() removes it.
// NOTE(review): the ending check needs length > 1, so a single whitespace character only counts as leading.
boolean writeLeadingSpace = content.length() > 0 && (Character.isWhitespace(content.charAt(0)));
boolean writeEndingSpace = content.length() > 1 && Character.isWhitespace(content.charAt(content.length() - 1));
content = content.trim();
if (content.length() != 0) {
boolean hasPrevContent = false;
// NOTE(review): indexOf() locates the item via equals(), and the previous sibling is only
// consulted from index 2 onwards (an item at index 1 never looks at index 0) - confirm both are intended.
int order = tagChildren.indexOf(item);
if (order >= 2) {
Object prev = tagChildren.get(order-1);
hasPrevContent = isContentOrInline(prev);
}
if (writeLeadingSpace && (whitespaceAllowed || hasPrevContent)) {
writer.write(' ');
}
// Split on line breaks, keeping the delimiters as tokens.
StringTokenizer tokenizer = new StringTokenizer(content, LINE_BREAK, true);
String prevToken = "";
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
// A line break whose previous token was also a line break is written as BR_TAG,
// any other line break as a single space, and text tokens are written trimmed.
if (prevToken.equals(token) && prevToken.equals(LINE_BREAK)) {
writer.write(BR_TAG);
// NOTE(review): this reset is overwritten by the unconditional assignment at the end of the loop body.
prevToken = "";
} else if (LINE_BREAK.equals(token)) {
writer.write(' ');
} else {
writer.write(token.trim());
}
prevToken = token;
}
boolean hasFollowingContent = false;
if (childrenIt.hasNext()) {
// Peek at the next sibling, then step back so the outer loop still visits it.
Object next = childrenIt.next();
hasFollowingContent = isContentOrInline(next);
childrenIt.previous();
}
if (writeEndingSpace && (whitespaceAllowed || hasFollowingContent)) {
writer.write(' ');
}
} else{
// Nothing left after trimming: remove the node from the local copy of the child list.
childrenIt.remove();
}
} else if(item instanceof ContentNode){
// Reached only when the tag name equals PRE_TAG: the text is written as returned by getContent().
String content = ((ContentNode) item).getContent();
writer.write(content);
} else if (item instanceof CommentNode) {
// Comments are written using their trimmed getCommentedContent() value.
String content = ((CommentNode) item).getCommentedContent().trim();
writer.write(content);
} else {
// Every other token serializes itself with this serializer.
((BaseToken)item).serialize(this, writer);
}
}
}
// The end tag is written only on this non-minimized path.
serializeEndTag(tagNode, writer, tagInfo != null && tagInfo.getDisplay().isAfterTagLineBreakNeeded());
}
}
/**
 * Tells whether the given node is text or an inline tag.
 *
 * @param node node to inspect
 * @return {@code true} for a ContentNode, or for a TagNode whose tag info is
 *         known and has display {@code Display.inline}; {@code false} otherwise
 */
private boolean isContentOrInline(Object node) {
boolean result = false;
if (node instanceof ContentNode) {
result = true;
} else if (node instanceof TagNode) {
TagInfo nextInfo = props.getTagInfoProvider().getTagInfo(((TagNode) node).getName());
result = nextInfo != null && nextInfo.getDisplay() == Display.inline;
}
return result;
}
}
+75
View File
@@ -0,0 +1,75 @@
/* Copyright (c) 2006-2013, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
/**
 * Content node for a CDATA section. Defines the plain and the comment-wrapped
 * ("safe") CDATA delimiters and can return its content either as stored or
 * surrounded by the safe delimiters.
 */
public class CData extends ContentNode implements HtmlNode {

    public static final String BEGIN_CDATA = "<![CDATA[";
    public static final String END_CDATA = "]]>";

    public static final String SAFE_BEGIN_CDATA = "/*<![CDATA[*/";
    public static final String SAFE_END_CDATA = "/*]]>*/";

    public static final String SAFE_BEGIN_CDATA_ALT = "//<![CDATA[";
    public static final String SAFE_END_CDATA_ALT = "//]]>";

    /**
     * @param content content of the node, handed to the superclass constructor
     */
    public CData(String content) {
        super(content);
    }

    /**
     * @return the stored content, with no delimiters added
     */
    public String getContentWithoutStartAndEndTokens() {
        return content;
    }

    /**
     * @return the stored content surrounded by {@link #SAFE_BEGIN_CDATA} and
     *         {@link #SAFE_END_CDATA}
     */
    public String getContentWithStartAndEndTokens() {
        return SAFE_BEGIN_CDATA + content + SAFE_END_CDATA;
    }

    /**
     * @return the same value as {@link #getContentWithoutStartAndEndTokens()}
     * @see org.htmlcleaner.ContentNode#getContent()
     */
    @Override
    public String getContent() {
        return getContentWithoutStartAndEndTokens();
    }

    /**
     * @return the same value as {@link #getContentWithStartAndEndTokens()}
     * @see org.htmlcleaner.ContentNode#toString()
     */
    @Override
    public String toString() {
        return getContentWithStartAndEndTokens();
    }
}
@@ -0,0 +1,54 @@
package org.htmlcleaner;
import java.util.Stack;
/**
 * Contains information about nodes that were closed due to their child nodes,
 * e.g. a 'p' tag that was closed because of a 'table' child tag.
 * <p>
 * Two parallel stacks are kept: the closed parents and the children that
 * broke them. Entries are always pushed and popped together.
 * </p>
 *
 * @author Konstantin Burov
 *
 */
class ChildBreaks {

    Stack<TagPos> closedByChildBreak = new Stack<TagPos>();
    private Stack<TagPos> breakingTags = new Stack<TagPos>();

    /**
     * Adds the break info to the top of the stacks.
     *
     * @param closedPos - position of the tag that was closed due to incorrect child
     * @param breakPos - position of the child that has broken its parent
     */
    public void addBreak(TagPos closedPos, TagPos breakPos) {
        closedByChildBreak.push(closedPos);
        breakingTags.push(breakPos);
    }

    /**
     * @return {@code true} if no closed parent is currently recorded
     */
    public boolean isEmpty() {
        return closedByChildBreak.isEmpty();
    }

    /**
     * @return name of the last child tag that has broken its parent.
     */
    public String getLastBreakingTag() {
        TagPos lastBreaker = breakingTags.peek();
        return lastBreaker.name;
    }

    /**
     * Pops the latest entry from both stacks.
     *
     * @return tag pos of the last parent that was broken.
     */
    public TagPos pop() {
        breakingTags.pop();
        return closedByChildBreak.pop();
    }

    /**
     * @return position of the last tag that has broken its parent, or -1 if no such tag is recorded.
     */
    public int getLastBreakingTagPosition() {
        if (breakingTags.isEmpty()) {
            return -1;
        }
        return breakingTags.peek().position;
    }
}
@@ -0,0 +1,80 @@
/* Copyright (c) 2006-2013, HtmlCleaner Team (Vladimir Nikic, Pat Moore, Scott Wilson)
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.Stack;
import java.util.TreeSet;
import org.htmlcleaner.conditional.ITagNodeCondition;
/**
* This class is for thread-safe handling of private instance variables from HtmlCleaner
*/
class CleanTimeValues {
// Both flags start out false.
// NOTE(review): presumably record whether a head/body tag has been opened during the current clean - confirm in HtmlCleaner.
boolean _headOpened = false;
boolean _bodyOpened = false;
// Raw (untyped) set backed by a LinkedHashSet, so iteration follows insertion order.
@SuppressWarnings("rawtypes")
Set _headTags = new LinkedHashSet();
// Raw (untyped) set backed by a TreeSet, so iteration follows the elements' sort order.
@SuppressWarnings("rawtypes")
Set allTags = new TreeSet();
// Stack of NestingState objects; starts empty.
transient Stack<NestingState> nestingStates = new Stack<NestingState>();
// Node references, null until assigned.
// NOTE(review): presumably the html, body, head and root nodes of the document being cleaned - confirm.
TagNode htmlNode;
TagNode bodyNode;
TagNode headNode;
TagNode rootNode;
// Conditions and nodes related to pruning; both sets start empty.
Set<ITagNodeCondition> pruneTagSet = new HashSet<ITagNodeCondition>();
Set<TagNode> pruneNodeSet = new HashSet<TagNode>();
// Has no initializer, so it is null until assigned elsewhere.
Set<ITagNodeCondition> allowTagSet;
/**
* A stack of namespaces for currently open tags. Every xmlns declaration
* on a tag adds another namespace to the stack, which is removed when the
* tag is closed. In this way you can keep track of what namespace a tag
* belongs to.
*/
transient Stack<String> namespace = new Stack<String>();
/**
* A map of all the namespace prefixes and URIs declared within the document.
* We use this to check whether any prefixes remain undeclared.
*/
transient HashMap<String, String> namespaceMap = new HashMap<String, String>();
}
@@ -0,0 +1,665 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import org.htmlcleaner.audit.ErrorType;
import org.htmlcleaner.audit.HtmlModificationListener;
import org.htmlcleaner.conditional.ITagNodeCondition;
import org.htmlcleaner.conditional.TagNodeAutoGeneratedCondition;
import org.htmlcleaner.conditional.TagNodeNameCondition;
/**
* Properties defining cleaner's behaviour
*/
public class CleanerProperties implements HtmlModificationListener{
// Force consistent cross-platform encoding ( mandatory for reliable server operation)
public static final String DEFAULT_CHARSET = "UTF-8";
public static final String BOOL_ATT_SELF = "self";
public static final String BOOL_ATT_EMPTY = "empty";
public static final String BOOL_ATT_TRUE = "true";
private ITagInfoProvider tagInfoProvider;
/**
* If this parameter is set to true, an ampersand sign (&) that precedes a valid XML character sequence (&XXX;) will not be escaped as &amp;XXX;
*/
private boolean advancedXmlEscape;
private String useCdataFor;
private List<String> useCdataForList;
private boolean translateSpecialEntities;
private boolean recognizeUnicodeChars;
private boolean omitUnknownTags;
private boolean treatUnknownTagsAsContent;
private boolean omitDeprecatedTags;
private boolean omitComments;
private boolean treatDeprecatedTagsAsContent;
private OptionalOutput omitXmlDeclaration;
private OptionalOutput omitDoctypeDeclaration;
private OptionalOutput omitHtmlEnvelope;
private boolean useEmptyElementTags;
private boolean allowMultiWordAttributes;
private String booleanAttributeValues;
private boolean ignoreQuestAndExclam;
private boolean allowHtmlInsideAttributes;
private boolean namespacesAware;
private boolean transSpecialEntitiesToNCR;
private boolean omitCdataOutsideScriptAndStyle;
private boolean deserializeEntities;
private boolean trimAttributeValues;
private int htmlVersion;
private boolean allowInvalidAttributeNames;
private String invalidAttributeNamePrefix;
/**
* Provides an arbitrary recursion depth
*/
private int maxDepth;
/**
 * @return the currently configured maximum depth
 */
public int getMaxDepth() {
    return this.maxDepth;
}

/**
 * @param maxDepth the maximum depth to store
 */
public void setMaxDepth(int maxDepth) {
    this.maxDepth = maxDepth;
}
/**
* "cause the cleaner cannot keep track of whitespace at that level",
* there are 2 lists built: one for the head , one for the body. So whitespace that falls outside of the head and body is not preserved
* this creates at least a newline break.
*
* More work than really wanted at this point to "preserve" the whitespace.
*/
private boolean addNewlineToHeadAndBody;
/**
* Tries to keep inside head all whitespace and comments that were originally there
*/
private boolean keepWhitespaceAndCommentsInHead;
private String hyphenReplacementInComment;
// Comma-separated list of tags to prune.
private String pruneTags;
// Comma-separated list of allowed tags.
private String allowTags;
private CleanerTransformations cleanerTransformations = new CleanerTransformations();
private List < HtmlModificationListener > htmlModificationListeners;
/**
* blacklist of tags
*/
private Set<ITagNodeCondition> pruneTagSet = new HashSet<ITagNodeCondition>();
/**
* the list of allowed tags (whitelist approach v. blacklist approach of pruneTags )
*/
private Set<ITagNodeCondition> allowTagSet = new HashSet<ITagNodeCondition>();
private String charset = DEFAULT_CHARSET;
private boolean transResCharsToNCR;
/**
 * Creates the properties and initialises them by calling {@code reset()}.
 */
public CleanerProperties() {
    reset();
}

/**
 * Creates the properties, initialises them by calling {@code reset()} and
 * then stores the given tag info provider.
 *
 * @param tagInfoProvider the tag info provider to use
 */
public CleanerProperties(ITagInfoProvider tagInfoProvider) {
    this();
    this.tagInfoProvider = tagInfoProvider;
}
/**
 * Replaces the tag info provider. Package-private.
 *
 * @param tagInfoProvider the tagInfoProvider to set
 */
void setTagInfoProvider(ITagInfoProvider tagInfoProvider) {
    this.tagInfoProvider = tagInfoProvider;
}

/**
 * @return the tag info provider currently stored
 */
public ITagInfoProvider getTagInfoProvider() {
    return this.tagInfoProvider;
}
/**
 * @return the {@code advancedXmlEscape} flag ({@link #reset()} sets it to true)
 */
public boolean isAdvancedXmlEscape() {
    return this.advancedXmlEscape;
}

/**
 * @param advancedXmlEscape new value of the {@code advancedXmlEscape} flag
 */
public void setAdvancedXmlEscape(boolean advancedXmlEscape) {
    this.advancedXmlEscape = advancedXmlEscape;
}
/**
 * @return the {@code transResCharsToNCR} flag
 */
public boolean isTransResCharsToNCR() {
    return this.transResCharsToNCR;
}

/**
 * @param transResCharsToNCR new value of the {@code transResCharsToNCR} flag
 */
public void setTransResCharsToNCR(boolean transResCharsToNCR) {
    this.transResCharsToNCR = transResCharsToNCR;
}
/**
 * @return true only when CDATA is enabled for both {@code script} and {@code style}
 */
public boolean isUseCdataForScriptAndStyle() {
    if (!isUseCdataFor("script")) {
        return false;
    }
    return isUseCdataFor("style");
}

/**
 * Convenience switch: true selects the tag list {@code "script,style"}, false selects
 * the empty list. Either way any previously configured list is replaced.
 *
 * @param useCdataForScriptAndStyle whether CDATA is used for script and style
 */
public void setUseCdataForScriptAndStyle(boolean useCdataForScriptAndStyle) {
    setUseCdataFor(useCdataForScriptAndStyle ? "script,style" : "");
}
/**
 * Sets the comma-separated list of tag names for which CDATA is used.
 *
 * <p>The string is stored unchanged for {@link #getUseCdataFor()}. For lookups it is
 * lower-cased and split on commas; each name is trimmed and empty entries are dropped,
 * so {@code "script, style"} behaves like {@code "script,style"} and {@code ""} yields
 * an empty list. Passing {@code null} stores {@code ""} and clears the lookup list.</p>
 *
 * @param useCdataFor comma-separated tag names, or null for none
 */
public void setUseCdataFor(String useCdataFor) {
    if (useCdataFor == null) {
        this.useCdataFor = "";
        this.useCdataForList = null;
        return;
    }
    this.useCdataFor = useCdataFor;
    List<String> tagNames = new ArrayList<String>();
    for (String token : useCdataFor.toLowerCase().split(",")) {
        // Trim like addTagNameConditions does, so whitespace after a comma is tolerated.
        String tagName = token.trim();
        if (tagName.length() > 0) {
            tagNames.add(tagName);
        }
    }
    this.useCdataForList = tagNames;
}
/**
 * @return the tag list string as last stored by {@code setUseCdataFor}
 */
public String getUseCdataFor() {
    return useCdataFor;
}

/**
 * Checks whether the given tag name is in the CDATA tag list; the name is lower-cased
 * before the lookup.
 *
 * @param useCdataFor tag name to look up
 * @return true if the name is listed; false if it is not, or if either the list or the
 *         name is null
 */
public boolean isUseCdataFor(String useCdataFor) {
    if (useCdataForList == null || useCdataFor == null) {
        return false;
    }
    return useCdataForList.contains(useCdataFor.toLowerCase());
}
/**
 * @return the {@code translateSpecialEntities} flag ({@link #reset()} sets it to true)
 */
public boolean isTranslateSpecialEntities() {
    return this.translateSpecialEntities;
}

/**
 * TODO : use {@link OptionalOutput}
 *
 * @param translateSpecialEntities new value of the {@code translateSpecialEntities} flag
 */
public void setTranslateSpecialEntities(boolean translateSpecialEntities) {
    this.translateSpecialEntities = translateSpecialEntities;
}
/**
 * @return the {@code recognizeUnicodeChars} flag ({@link #reset()} sets it to true)
 */
public boolean isRecognizeUnicodeChars() {
    return this.recognizeUnicodeChars;
}

/**
 * @param recognizeUnicodeChars new value of the {@code recognizeUnicodeChars} flag
 */
public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
    this.recognizeUnicodeChars = recognizeUnicodeChars;
}
/**
 * @return the {@code omitUnknownTags} flag ({@link #reset()} sets it to false)
 */
public boolean isOmitUnknownTags() {
    return this.omitUnknownTags;
}

/**
 * @param omitUnknownTags new value of the {@code omitUnknownTags} flag
 */
public void setOmitUnknownTags(boolean omitUnknownTags) {
    this.omitUnknownTags = omitUnknownTags;
}
/**
 * @return the {@code treatUnknownTagsAsContent} flag ({@link #reset()} sets it to false)
 */
public boolean isTreatUnknownTagsAsContent() {
    return this.treatUnknownTagsAsContent;
}

/**
 * @param treatUnknownTagsAsContent new value of the {@code treatUnknownTagsAsContent} flag
 */
public void setTreatUnknownTagsAsContent(boolean treatUnknownTagsAsContent) {
    this.treatUnknownTagsAsContent = treatUnknownTagsAsContent;
}
/**
 * @return the {@code omitDeprecatedTags} flag ({@link #reset()} sets it to false)
 */
public boolean isOmitDeprecatedTags() {
    return this.omitDeprecatedTags;
}

/**
 * @param omitDeprecatedTags new value of the {@code omitDeprecatedTags} flag
 */
public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
    this.omitDeprecatedTags = omitDeprecatedTags;
}
/**
 * @return the {@code treatDeprecatedTagsAsContent} flag ({@link #reset()} sets it to false)
 */
public boolean isTreatDeprecatedTagsAsContent() {
    return this.treatDeprecatedTagsAsContent;
}

/**
 * @param treatDeprecatedTagsAsContent new value of the {@code treatDeprecatedTagsAsContent} flag
 */
public void setTreatDeprecatedTagsAsContent(boolean treatDeprecatedTagsAsContent) {
    this.treatDeprecatedTagsAsContent = treatDeprecatedTagsAsContent;
}
/**
 * @return the {@code omitComments} flag ({@link #reset()} sets it to false)
 */
public boolean isOmitComments() {
    return this.omitComments;
}

/**
 * @param omitComments new value of the {@code omitComments} flag
 */
public void setOmitComments(boolean omitComments) {
    this.omitComments = omitComments;
}
/**
 * @return true when the stored setting is {@code OptionalOutput.omit}
 */
public boolean isOmitXmlDeclaration() {
    return OptionalOutput.omit == this.omitXmlDeclaration;
}

/**
 * Maps the boolean onto the stored {@code OptionalOutput} value.
 *
 * @param omitXmlDeclaration true stores {@code OptionalOutput.omit}, false stores
 *                           {@code OptionalOutput.alwaysOutput}
 */
public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
    if (omitXmlDeclaration) {
        this.omitXmlDeclaration = OptionalOutput.omit;
    } else {
        this.omitXmlDeclaration = OptionalOutput.alwaysOutput;
    }
}
/**
 * @return true when the doctype setting is {@code OptionalOutput.omit}, and also
 *         whenever the HTML envelope is omitted
 */
public boolean isOmitDoctypeDeclaration() {
    if (OptionalOutput.omit == this.omitDoctypeDeclaration) {
        return true;
    }
    return isOmitHtmlEnvelope();
}

/**
 * Maps the boolean onto the stored {@code OptionalOutput} value.
 *
 * @param omitDoctypeDeclaration true stores {@code OptionalOutput.omit}, false stores
 *                               {@code OptionalOutput.alwaysOutput}
 */
public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
    if (omitDoctypeDeclaration) {
        this.omitDoctypeDeclaration = OptionalOutput.omit;
    } else {
        this.omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
    }
}
/**
 * @return true when the stored setting is {@code OptionalOutput.omit}
 */
public boolean isOmitHtmlEnvelope() {
    return OptionalOutput.omit == this.omitHtmlEnvelope;
}

/**
 * Maps the boolean onto the stored {@code OptionalOutput} value.
 *
 * @param omitHtmlEnvelope true stores {@code OptionalOutput.omit}, false stores
 *                         {@code OptionalOutput.alwaysOutput}
 */
public void setOmitHtmlEnvelope(boolean omitHtmlEnvelope) {
    if (omitHtmlEnvelope) {
        this.omitHtmlEnvelope = OptionalOutput.omit;
    } else {
        this.omitHtmlEnvelope = OptionalOutput.alwaysOutput;
    }
}
/**
 * @return the {@code useEmptyElementTags} flag ({@link #reset()} sets it to true)
 */
public boolean isUseEmptyElementTags() {
    return this.useEmptyElementTags;
}

/**
 * @param useEmptyElementTags new value of the {@code useEmptyElementTags} flag
 */
public void setUseEmptyElementTags(boolean useEmptyElementTags) {
    this.useEmptyElementTags = useEmptyElementTags;
}
/**
 * @return the {@code allowMultiWordAttributes} flag ({@link #reset()} sets it to true)
 */
public boolean isAllowMultiWordAttributes() {
    return this.allowMultiWordAttributes;
}

/**
 * @param allowMultiWordAttributes new value of the {@code allowMultiWordAttributes} flag
 */
public void setAllowMultiWordAttributes(boolean allowMultiWordAttributes) {
    this.allowMultiWordAttributes = allowMultiWordAttributes;
}
/**
 * @return the {@code allowHtmlInsideAttributes} flag ({@link #reset()} sets it to false)
 */
public boolean isAllowHtmlInsideAttributes() {
    return this.allowHtmlInsideAttributes;
}

/**
 * @param allowHtmlInsideAttributes new value of the {@code allowHtmlInsideAttributes} flag
 */
public void setAllowHtmlInsideAttributes(boolean allowHtmlInsideAttributes) {
    this.allowHtmlInsideAttributes = allowHtmlInsideAttributes;
}
/**
 * @return the {@code ignoreQuestAndExclam} flag ({@link #reset()} sets it to true)
 */
public boolean isIgnoreQuestAndExclam() {
    return this.ignoreQuestAndExclam;
}

/**
 * @param ignoreQuestAndExclam new value of the {@code ignoreQuestAndExclam} flag
 */
public void setIgnoreQuestAndExclam(boolean ignoreQuestAndExclam) {
    this.ignoreQuestAndExclam = ignoreQuestAndExclam;
}
/**
 * @return the {@code namespacesAware} flag ({@link #reset()} sets it to true)
 */
public boolean isNamespacesAware() {
    return this.namespacesAware;
}

/**
 * @param namespacesAware new value of the {@code namespacesAware} flag
 */
public void setNamespacesAware(boolean namespacesAware) {
    this.namespacesAware = namespacesAware;
}
/**
 * @return the {@code addNewlineToHeadAndBody} flag ({@link #reset()} sets it to true)
 */
public boolean isAddNewlineToHeadAndBody() {
    return this.addNewlineToHeadAndBody;
}

/**
 * @param addNewlineToHeadAndBody new value of the {@code addNewlineToHeadAndBody} flag
 */
public void setAddNewlineToHeadAndBody(boolean addNewlineToHeadAndBody) {
    this.addNewlineToHeadAndBody = addNewlineToHeadAndBody;
}
/**
 * @return the {@code keepWhitespaceAndCommentsInHead} flag ({@link #reset()} sets it to true)
 */
public boolean isKeepWhitespaceAndCommentsInHead() {
    return this.keepWhitespaceAndCommentsInHead;
}

/**
 * @param keepHeadWhitespace new value of the {@code keepWhitespaceAndCommentsInHead} flag
 */
public void setKeepWhitespaceAndCommentsInHead(boolean keepHeadWhitespace) {
    keepWhitespaceAndCommentsInHead = keepHeadWhitespace;
}
/**
 * @return the hyphen replacement string ({@link #reset()} sets it to {@code "="})
 */
public String getHyphenReplacementInComment() {
    return this.hyphenReplacementInComment;
}

/**
 * @param hyphenReplacementInComment the replacement string to store
 */
public void setHyphenReplacementInComment(String hyphenReplacementInComment) {
    this.hyphenReplacementInComment = hyphenReplacementInComment;
}
/**
 * @return the comma-separated prune tag string as last passed to {@code setPruneTags},
 *         possibly null
 */
public String getPruneTags() {
    return this.pruneTags;
}
/**
 * @return the {@code omitCdataOutsideScriptAndStyle} flag ({@link #reset()} sets it to false)
 */
public boolean isOmitCdataOutsideScriptAndStyle() {
    return this.omitCdataOutsideScriptAndStyle;
}

/**
 * @param value new value of the {@code omitCdataOutsideScriptAndStyle} flag
 */
public void setOmitCdataOutsideScriptAndStyle(boolean value) {
    this.omitCdataOutsideScriptAndStyle = value;
}
/**
 * @return the {@code deserializeEntities} flag
 */
public boolean isDeserializeEntities() {
    return this.deserializeEntities;
}

/**
 * @param deserializeEntities new value of the {@code deserializeEntities} flag
 */
public void setDeserializeEntities(boolean deserializeEntities) {
    this.deserializeEntities = deserializeEntities;
}
/**
 * Stores the HTML version and switches the tag info provider to match it:
 * version 4 selects {@code Html4TagProvider.INSTANCE}, every other value selects
 * {@code Html5TagProvider.INSTANCE}.
 *
 * @param version Number 4 for html4 or 5 for html5
 */
public void setHtmlVersion(int version) {
    this.htmlVersion = version;
    if (version != 4) {
        setTagInfoProvider(Html5TagProvider.INSTANCE);
    } else {
        setTagInfoProvider(Html4TagProvider.INSTANCE);
    }
}

/**
 * Returns the HTML version.
 *
 * @return int the version number last passed to {@code setHtmlVersion}
 */
public int getHtmlVersion() {
    return htmlVersion;
}
/**
 * @return the {@code trimAttributeValues} flag ({@link #reset()} sets it to true)
 */
public boolean isTrimAttributeValues() {
    return this.trimAttributeValues;
}

/**
 * @param trimAttributeValues new value of the {@code trimAttributeValues} flag
 */
public void setTrimAttributeValues(boolean trimAttributeValues) {
    this.trimAttributeValues = trimAttributeValues;
}
/**
 * Replaces the prune tag configuration. The string is remembered for
 * {@code getPruneTags()}, the prune set is reset to its baseline, and one tag name
 * condition is added for every comma-separated name in the string.
 *
 * @param pruneTags comma-separated tag names; null leaves only the baseline condition
 */
public void setPruneTags(String pruneTags) {
    this.pruneTags = pruneTags;
    resetPruneTagSet();
    addTagNameConditions(pruneTagSet, pruneTags);
}
/**
 * Adds one more condition to the existing prune tag set without resetting it.
 *
 * @param condition the condition to add
 */
public void addPruneTagNodeCondition(ITagNodeCondition condition) {
    this.pruneTagSet.add(condition);
}

/**
 * @return the prune tag set itself (not a copy)
 */
public Set<ITagNodeCondition> getPruneTagSet() {
    return this.pruneTagSet;
}
/**
 * @return the comma-separated allowed tag string as last passed to {@code setAllowTags},
 *         possibly null
 */
public String getAllowTags() {
    return this.allowTags;
}

/**
 * Remembers the string and rebuilds the allowed tag set from it.
 *
 * @param allowTags comma-separated tag names; null empties the allowed set
 */
public void setAllowTags(String allowTags) {
    this.allowTags = allowTags;
    setAllowTagSet(allowTags);
}

/**
 * Empties the allowed tag set and refills it with one name condition per listed tag.
 *
 * @param allowTags comma-separated tag names, may be null
 */
private void setAllowTagSet(String allowTags) {
    this.allowTagSet.clear();
    this.addTagNameConditions(this.allowTagSet, allowTags);
}
/**
 * @return the {@code transSpecialEntitiesToNCR} flag
 */
public boolean isTransSpecialEntitiesToNCR() {
    return this.transSpecialEntitiesToNCR;
}

/**
 * @param transSpecialEntitiesToNCR new value of the {@code transSpecialEntitiesToNCR} flag
 */
public void setTransSpecialEntitiesToNCR(boolean transSpecialEntitiesToNCR) {
    this.transSpecialEntitiesToNCR = transSpecialEntitiesToNCR;
}
/**
 * Splits a comma-separated list of tag names and adds a {@code TagNodeNameCondition}
 * for each one, trimmed and lower-cased, to the given set. A null list adds nothing.
 *
 * @param tagSet      the set that receives the conditions
 * @param tagsNameStr comma-separated tag names, may be null
 */
private void addTagNameConditions(Set<ITagNodeCondition> tagSet, String tagsNameStr) {
    if (tagsNameStr == null) {
        return;
    }
    for (StringTokenizer names = new StringTokenizer(tagsNameStr, ","); names.hasMoreTokens(); ) {
        String tagName = names.nextToken().trim().toLowerCase();
        tagSet.add(new TagNodeNameCondition(tagName));
    }
}
/**
 * @return the allowed tag set itself (not a copy)
 */
public Set<ITagNodeCondition> getAllowTagSet() {
    return this.allowTagSet;
}
/**
 * Stores the charset name as given; no validation is performed here.
 *
 * @param charset the charset to set
 */
public void setCharset(String charset) {
    this.charset = charset;
}

/**
 * @return the charset name currently stored
 */
public String getCharset() {
    return this.charset;
}
/**
 * @return the boolean attribute mode currently stored
 */
public String getBooleanAttributeValues() {
    return this.booleanAttributeValues;
}

/**
 * Stores the boolean attribute mode. The value is accepted, ignoring case, when it
 * equals {@code BOOL_ATT_SELF}, {@code BOOL_ATT_EMPTY} or {@code BOOL_ATT_TRUE}, and is
 * then stored lower-cased. Anything else, including null, stores {@code BOOL_ATT_SELF}.
 *
 * @param booleanAttributeValues the requested mode
 */
public void setBooleanAttributeValues(String booleanAttributeValues) {
    boolean recognised = BOOL_ATT_SELF.equalsIgnoreCase(booleanAttributeValues)
            || BOOL_ATT_EMPTY.equalsIgnoreCase(booleanAttributeValues)
            || BOOL_ATT_TRUE.equalsIgnoreCase(booleanAttributeValues);
    if (!recognised) {
        this.booleanAttributeValues = BOOL_ATT_SELF;
        return;
    }
    this.booleanAttributeValues = booleanAttributeValues.toLowerCase();
}
/**
 * Restores the default configuration. Both constructors call this method.
 *
 * <pre>
 * advancedXmlEscape = true;
 * setUseCdataFor("script,style");
 * translateSpecialEntities = true;
 * recognizeUnicodeChars = true;
 * omitUnknownTags = false;
 * treatUnknownTagsAsContent = false;
 * omitDeprecatedTags = false;
 * treatDeprecatedTagsAsContent = false;
 * omitComments = false;
 * omitXmlDeclaration = OptionalOutput.alwaysOutput;
 * omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
 * omitHtmlEnvelope = OptionalOutput.alwaysOutput;
 * useEmptyElementTags = true;
 * allowMultiWordAttributes = true;
 * allowHtmlInsideAttributes = false;
 * ignoreQuestAndExclam = true;
 * namespacesAware = true;
 * addNewlineToHeadAndBody = true;
 * keepWhitespaceAndCommentsInHead = true;
 * hyphenReplacementInComment = "=";
 * pruneTags = null (the prune set keeps only TagNodeAutoGeneratedCondition.INSTANCE);
 * allowTags = null (the allowed set is emptied);
 * booleanAttributeValues = BOOL_ATT_SELF;
 * charset = "UTF-8";
 * cleanerTransformations is cleared;
 * tagInfoProvider = Html4TagProvider.INSTANCE if the html version is HtmlCleaner.HTML_4,
 *                   otherwise Html5TagProvider.INSTANCE;
 * htmlModificationListeners = new empty list (registered listeners are dropped);
 * omitCdataOutsideScriptAndStyle = false;
 * trimAttributeValues = true;
 * invalidAttributeNamePrefix = "";
 * allowInvalidAttributeNames = false;
 * maxDepth = 1000
 * </pre>
 *
 * NOTE(review): htmlVersion, transResCharsToNCR, transSpecialEntitiesToNCR and
 * deserializeEntities are not assigned here, so they keep their current values
 * across a reset -- confirm that this is intended.
 */
public void reset() {
advancedXmlEscape = true;
setUseCdataFor("script,style");
translateSpecialEntities = true;
recognizeUnicodeChars = true;
omitUnknownTags = false;
treatUnknownTagsAsContent = false;
omitDeprecatedTags = false;
treatDeprecatedTagsAsContent = false;
omitComments = false;
omitXmlDeclaration = OptionalOutput.alwaysOutput;
omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
omitHtmlEnvelope = OptionalOutput.alwaysOutput;
useEmptyElementTags = true;
allowMultiWordAttributes = true;
allowHtmlInsideAttributes = false;
ignoreQuestAndExclam = true;
namespacesAware = true;
addNewlineToHeadAndBody = true;
keepWhitespaceAndCommentsInHead = true;
hyphenReplacementInComment = "=";
// The setters also rebuild pruneTagSet / allowTagSet, not just the strings.
setPruneTags(null);
setAllowTags(null);
booleanAttributeValues = BOOL_ATT_SELF;
// Literal rather than DEFAULT_CHARSET, which the field initializer uses.
charset = "UTF-8";
cleanerTransformations.clear();
// setPruneTags(null) above has already reset the prune set; this call repeats it.
resetPruneTagSet();
// htmlVersion is not reset, so the provider follows whatever version is currently stored.
if (this.getHtmlVersion()==HtmlCleaner.HTML_4){
tagInfoProvider = Html4TagProvider.INSTANCE;
}
else{
tagInfoProvider = Html5TagProvider.INSTANCE;
}
htmlModificationListeners = new ArrayList < HtmlModificationListener >();
omitCdataOutsideScriptAndStyle = false;
trimAttributeValues = true;
invalidAttributeNamePrefix = "";
allowInvalidAttributeNames = false;
maxDepth = 1000;
}
/**
 * Empties the prune tag set and re-adds its baseline member,
 * {@code TagNodeAutoGeneratedCondition.INSTANCE}.
 */
private void resetPruneTagSet() {
    this.pruneTagSet.clear();
    this.pruneTagSet.add(TagNodeAutoGeneratedCondition.INSTANCE);
}
/**
 * @return the cleanerTransformations
 */
public CleanerTransformations getCleanerTransformations() {
    return this.cleanerTransformations;
}

/**
 * Installs a new transformation collection. Passing null does not store null; it
 * clears the collection that is currently installed instead.
 *
 * @param cleanerTransformations the collection to install, or null to clear the current one
 */
public void setCleanerTransformations(CleanerTransformations cleanerTransformations) {
    if (cleanerTransformations != null) {
        this.cleanerTransformations = cleanerTransformations;
        return;
    }
    this.cleanerTransformations.clear();
}
/**
 * Registers a listener that the {@code fire*} methods of this class will notify about
 * changes the cleaner makes during the cleanup process. Listeners registered here are
 * dropped by {@code reset()}.
 *
 * @param listener -- listener object to be notified of the changes.
 */
public void addHtmlModificationListener(HtmlModificationListener listener) {
    this.htmlModificationListeners.add(listener);
}
/**
 * Forwards a condition-triggered modification to every registered listener, in
 * registration order.
 *
 * @param condition the condition passed on to the listeners
 * @param tagNode   the tag node passed on to the listeners
 */
public void fireConditionModification(ITagNodeCondition condition, TagNode tagNode) {
    for (HtmlModificationListener each : this.htmlModificationListeners) {
        each.fireConditionModification(condition, tagNode);
    }
}

/**
 * Forwards an HTML error notification to every registered listener.
 *
 * @param certainty     passed through to the listeners unchanged
 * @param startTagToken the tag node passed on to the listeners
 * @param type          the error type passed on to the listeners
 */
public void fireHtmlError(boolean certainty, TagNode startTagToken, ErrorType type) {
    for (HtmlModificationListener each : this.htmlModificationListeners) {
        each.fireHtmlError(certainty, startTagToken, type);
    }
}

/**
 * Forwards an "ugly HTML" notification to every registered listener.
 *
 * @param certainty     passed through to the listeners unchanged
 * @param startTagToken the tag node passed on to the listeners
 * @param errorType     the error type passed on to the listeners
 */
public void fireUglyHtml(boolean certainty, TagNode startTagToken, ErrorType errorType) {
    for (HtmlModificationListener each : this.htmlModificationListeners) {
        each.fireUglyHtml(certainty, startTagToken, errorType);
    }
}

/**
 * Forwards a user-defined modification notification to every registered listener.
 *
 * @param certainty passed through to the listeners unchanged
 * @param tagNode   the tag node passed on to the listeners
 * @param errorType the error type passed on to the listeners
 */
public void fireUserDefinedModification(boolean certainty, TagNode tagNode, ErrorType errorType) {
    for (HtmlModificationListener each : this.htmlModificationListeners) {
        each.fireUserDefinedModification(certainty, tagNode, errorType);
    }
}
/**
 * Gets the prefix to use when trying to make attribute names valid.
 *
 * @return invalidAttributeNamePrefix ({@code reset()} sets it to the empty string)
 */
public String getInvalidXmlAttributeNamePrefix() {
    return this.invalidAttributeNamePrefix;
}

/**
 * Sets the prefix to use for XML attributes that are invalid.
 *
 * @param invalidXmlAttributePrefix the prefix to use
 */
public void setInvalidXmlAttributeNamePrefix(String invalidXmlAttributePrefix) {
    invalidAttributeNamePrefix = invalidXmlAttributePrefix;
}
/**
 * Sets whether to allow invalid attribute names, or to try to fix or omit them.
 *
 * @param allowInvalidAttributeNames True if invalid attributes allowed
 */
public void setAllowInvalidAttributeNames(boolean allowInvalidAttributeNames) {
    this.allowInvalidAttributeNames = allowInvalidAttributeNames;
}

/**
 * Per the original author's note: if false, when outputting XML an attribute name that
 * is not valid is fixed, where possible, by applying a prefix and removing invalid
 * characters; otherwise the invalid attribute is omitted.
 * NOTE(review): the serializer code is not visible here -- confirm the exact handling.
 *
 * @return True if invalid attribute names are allowed.
 */
public boolean isAllowInvalidAttributeNames() {
    return this.allowInvalidAttributeNames;
}
}
@@ -0,0 +1,149 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
 * Contains transformation collection: per-tag transformations keyed by the source tag,
 * plus one set of global attribute transformations that is applied to every tag.
 */
public class CleanerTransformations {

    /** Tag transformations keyed by {@code TagTransformation.getSourceTag()}. */
    private final Map<String, TagTransformation> mappings = new HashMap<String, TagTransformation>();

    /** Attribute transformations applied to every tag, after any tag-specific ones. */
    private final TagTransformation globalTransformations = new TagTransformation();

    /**
     * Creates an empty collection.
     */
    public CleanerTransformations() {
    }

    /**
     * Creates a collection from textual transformation descriptions.
     *
     * @param transInfos description map; see {@link #updateTagTransformations(Map)}
     */
    public CleanerTransformations(Map<?, ?> transInfos) {
        updateTagTransformations(transInfos);
    }

    /**
     * Adds specified tag transformation to the collection, replacing any transformation
     * already stored under the same source tag. A null argument is ignored.
     *
     * @param tagTransformation the transformation to add, may be null
     */
    public void addTransformation(TagTransformation tagTransformation) {
        if (tagTransformation != null) {
            mappings.put(tagTransformation.getSourceTag(), tagTransformation);
        }
    }

    /**
     * Adds an attribute transformation that applies to all tags.
     *
     * @param attributeTransformation the transformation to add to the global set
     */
    public void addGlobalTransformation(AttributeTransformation attributeTransformation) {
        globalTransformations.addAttributePatternTransformation(attributeTransformation);
    }

    /**
     * @param tagName tag name; it is lower-cased before the lookup
     * @return true if a transformation is stored for the tag; false for a null name
     */
    public boolean hasTransformationForTag(String tagName) {
        return tagName != null && mappings.containsKey(tagName.toLowerCase());
    }

    /**
     * @param tagName tag name; it is lower-cased before the lookup
     * @return the transformation stored for the tag, or null if there is none or the
     *         name is null
     */
    public TagTransformation getTransformation(String tagName) {
        return tagName != null ? mappings.get(tagName.toLowerCase()) : null;
    }

    /**
     * Applies one textual transformation description.
     *
     * <p>A key without a dot (or starting with one) describes a tag transformation:
     * {@code tagname[=destname[,preserveatts]]}. The value is split on {@code ,} and
     * {@code ;}; the first token is the destination tag and the second one, if present,
     * enables attribute preservation when it is {@code true}, {@code yes} or {@code 1}.
     * A key of the form {@code tagname.attribute} adds an attribute transformation to
     * the already registered transformation of {@code tagname}; it is ignored when no
     * such transformation exists or when no attribute name follows the dot.</p>
     *
     * @param key   the transformation key, must not be null
     * @param value the transformation value, may be null
     */
    public void updateTagTransformations(String key, String value) {
        int index = key.indexOf('.');

        // new tag transformation case (tagname[=destname[,preserveatts]])
        if (index <= 0) {
            String destTag = null;
            boolean preserveSourceAtts = true;
            if (value != null) {
                String[] tokens = Utils.tokenize(value, ",;");
                if (tokens.length > 0) {
                    destTag = tokens[0];
                }
                if (tokens.length > 1) {
                    preserveSourceAtts = "true".equalsIgnoreCase(tokens[1])
                            || "yes".equalsIgnoreCase(tokens[1])
                            || "1".equals(tokens[1]);
                }
            }
            addTransformation(new TagTransformation(key, destTag, preserveSourceAtts));
            return;
        }

        // attribute transformation description
        String[] parts = Utils.tokenize(key, ".");
        // A key such as "div." may tokenize to the tag name alone; without this guard
        // reading parts[1] would throw ArrayIndexOutOfBoundsException.
        if (parts.length < 2) {
            return;
        }
        TagTransformation trans = getTransformation(parts[0]);
        if (trans != null) {
            trans.addAttributeTransformation(parts[1], value);
        }
    }

    /**
     * Applies every entry of the map through
     * {@link #updateTagTransformations(String, String)}, in the map's iteration order.
     *
     * @param transInfos map whose keys and values are strings
     * @throws ClassCastException if a key or value is not a {@code String}
     */
    public void updateTagTransformations(Map<?, ?> transInfos) {
        for (Map.Entry<?, ?> entry : transInfos.entrySet()) {
            updateTagTransformations((String) entry.getKey(), (String) entry.getValue());
        }
    }

    /**
     * Runs the attributes through the tag-specific transformation, if one is registered
     * for the tag, and then through the global transformations.
     *
     * @param originalTagName name of the tag the attributes belong to
     * @param attributes      the attributes to transform
     * @return the result of the global transformation step
     */
    public Map<String, String> transformAttributes(String originalTagName, Map<String, String> attributes) {
        TagTransformation tagTrans = getTransformation(originalTagName);
        Map<String, String> results = attributes;
        if (tagTrans != null) {
            results = tagTrans.applyTagTransformations(attributes);
        }
        return this.globalTransformations.applyTagTransformations(results);
    }

    /**
     * @param tagName the original tag name
     * @return the destination tag of the transformation registered for the tag, or the
     *         given name unchanged when no transformation is registered
     */
    public String getTagName(String tagName) {
        TagTransformation tagTransformation = getTransformation(tagName);
        return tagTransformation != null ? tagTransformation.getDestTag() : tagName;
    }

    /**
     * Removes all per-tag transformations. The global transformations are not touched.
     */
    public void clear() {
        this.mappings.clear();
    }
}
@@ -0,0 +1,83 @@
/*
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
Additional work by Amplafi. -- All rights released.
*/
package org.htmlcleaner;
/**
 * Describes which closing forms a tag may take: the minimized form {@code <x/>}
 * and/or a separate end tag.
 *
 * @author patmoore
 */
public enum CloseTag {
    /**
     * {@code <div></div>} is required. Minimizing to {@code <div/>} is not permitted.
     */
    required(false, true),
    /**
     * {@code <hr>} or {@code <hr/>} is permitted.
     */
    optional(true, true),
    /**
     * The minimized form (e.g. {@code <img/>}) is permitted, but an end tag is not.
     */
    forbidden(true, false);

    private final boolean minimizedTagPermitted;
    private final boolean endTagPermitted;

    /**
     * @param minimizedTagPermitted if true tag can be reduced to {@code <x/>}
     * @param endTagPermitted       if true an end tag is permitted
     */
    CloseTag(boolean minimizedTagPermitted, boolean endTagPermitted) {
        this.endTagPermitted = endTagPermitted;
        this.minimizedTagPermitted = minimizedTagPermitted;
    }

    /**
     * @return true if {@code <x/>} form is allowed
     */
    public boolean isMinimizedTagPermitted() {
        return minimizedTagPermitted;
    }

    /**
     * @return true if an end tag is permitted; false only for {@link #forbidden}
     */
    public boolean isEndTagPermitted() {
        return this.endTagPermitted;
    }
}
@@ -0,0 +1,384 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.FileOutputStream;
import java.net.URL;
import java.util.Map;
import java.util.Scanner;
import java.util.TreeMap;
import java.util.logging.Logger;
import org.htmlcleaner.audit.HtmlModificationListenerLogger;
/**
 * <p>Command line usage class.</p>
 *
 * <p>Arguments are {@code name=value} pairs. A name matches an option when it starts
 * with the option's name, ignoring case, and the first matching argument wins. With no
 * {@code src} argument the HTML is read from standard input; if nothing but whitespace
 * arrives there, the usage text is printed and the process exits with status 1.</p>
 */
public class CommandLine {

    private static final String OMITXMLDECL = "omitxmldecl";

    private final static String className = CommandLine.class.getName();
    private final static Logger logger = Logger.getLogger(className);

    /**
     * If the specified argument name exists without a value, return true.
     * If it exists with a value, translate it as a boolean.
     * Arguments that do not match the name are ignored; when several match, the last
     * one decides.
     *
     * @param args the command line arguments
     * @param name the switch name
     * @return true, or false, depending on whether the switch has been specified
     */
    private static boolean getSwitchArgument(String[] args, String name) {
        String wanted = name.toLowerCase();
        boolean value = false;
        for (String curr : args) {
            int eqIndex = curr.indexOf('=');
            if (eqIndex >= 0) {
                String argName = curr.substring(0, eqIndex).trim();
                if (argName.toLowerCase().startsWith(wanted)) {
                    value = toBoolean(curr.substring(eqIndex + 1).trim());
                }
            } else if (curr.trim().toLowerCase().startsWith(wanted)) {
                // Only a bare argument naming this switch turns it on; other bare
                // arguments (for example "t:div") must not.
                value = true;
            }
        }
        return value;
    }

    /**
     * Finds the value of the first {@code name=value} argument whose name starts with
     * the given name, ignoring case.
     *
     * @param args         the command line arguments
     * @param name         the option name to look for
     * @param defaultValue returned when no argument matches
     * @return the trimmed value of the first match, or {@code defaultValue}
     */
    private static String getArgValue(String[] args, String name, String defaultValue) {
        String wanted = name.toLowerCase();
        for (String curr : args) {
            int eqIndex = curr.indexOf('=');
            if (eqIndex >= 0) {
                String argName = curr.substring(0, eqIndex).trim();
                if (argName.toLowerCase().startsWith(wanted)) {
                    return curr.substring(eqIndex + 1).trim();
                }
            }
        }
        return defaultValue;
    }

    /**
     * @param s text to interpret, may be null
     * @return true for {@code on}, {@code true} or {@code yes} (ignoring case)
     */
    private static boolean toBoolean(String s) {
        return s != null && ( "on".equalsIgnoreCase(s) || "true".equalsIgnoreCase(s) || "yes".equalsIgnoreCase(s) );
    }

    /**
     * Reads everything available from the scanner, keeping the line structure.
     *
     * @param scan scanner over the input
     * @return the input with every line terminated by {@code '\n'}
     */
    private static String readStandardInput(Scanner scan) {
        StringBuilder text = new StringBuilder();
        while (scan.hasNextLine()) {
            // Keep a separator: joining lines directly would merge the last word of
            // one line with the first word of the next.
            text.append(scan.nextLine()).append('\n');
        }
        return text.toString();
    }

    /** Prints the usage text to standard error. */
    private static void printUsage() {
        System.err.println("Usage: java -jar htmlcleanerXX.jar src=<url | file> [htmlver=4] [incharset=<charset>] " +
                "[dest=<file>] [outcharset=<charset>] [taginfofile=<file>] [options...]");
        System.err.println("Alternative: java -jar htmlcleanerXX.jar (reads the input from console)");
        System.err.println("");
        System.err.println("where options include:");
        System.err.println(" outputtype=simple* | compact | browser-compact | pretty");
        System.err.println(" advancedxmlescape=true* | false");
        System.err.println(" usecdata=true* | false");
        System.err.println(" usecdatafor=<string value> [script,style]");
        System.err.println(" specialentities=true* | false");
        System.err.println(" unicodechars=true* | false");
        System.err.println(" omitunknowntags=true | false*");
        System.err.println(" treatunknowntagsascontent=true | false*");
        System.err.println(" omitdeprtags=true | false*");
        System.err.println(" treatdeprtagsascontent=true | false*");
        System.err.println(" omitcomments=true | false*");
        System.err.println(" " +OMITXMLDECL +"=true* | false");
        System.err.println(" omitdoctypedecl=true* | false");
        System.err.println(" omithtmlenvelope=true | false*");
        System.err.println(" useemptyelementtags=true* | false");
        System.err.println(" allowmultiwordattributes=true* | false");
        System.err.println(" allowhtmlinsideattributes=true | false*");
        System.err.println(" ignoreqe=true | false*");
        System.err.println(" namespacesaware=true* | false");
        System.err.println(" hyphenreplacement=<string value> [=]");
        System.err.println(" prunetags=<string value> []");
        System.err.println(" booleanatts=self* | empty | true");
        System.err.println(" nodebyxpath=<xpath expression>");
        System.err.println(" allowinvalidxmlattributenames=true | false*");
        System.err.println(" invalidxmlattributenameprefix=<string value> []");
        System.err.println(" t:<sourcetagX>[=<desttag>[,<preserveatts>]]");
        System.err.println(" t:<sourcetagX>.<destattrY>[=<template>]");
    }

    /**
     * Copies every option present on the command line onto the cleaner properties.
     * Options that are absent or empty leave the corresponding property untouched.
     *
     * @param args  the command line arguments
     * @param props the properties to configure
     */
    private static void configureProperties(String[] args, CleanerProperties props) {
        String advancedXmlEscape = getArgValue(args, "advancedxmlescape", "");
        String useCData = getArgValue(args, "usecdata", "");
        String useCDataFor = getArgValue(args, "usecdatafor", "");
        String translateSpecialEntities = getArgValue(args, "specialentities", "");
        String unicodeChars = getArgValue(args, "unicodechars", "");
        String omitUnknownTags = getArgValue(args, "omitunknowntags", "");
        String treatUnknownTagsAsContent = getArgValue(args, "treatunknowntagsascontent", "");
        String omitDeprecatedTags = getArgValue(args, "omitdeprtags", "");
        String treatDeprecatedTagsAsContent = getArgValue(args, "treatdeprtagsascontent", "");
        String omitComments = getArgValue(args, "omitcomments", "");
        String omitXmlDeclaration = getArgValue(args, OMITXMLDECL, "");
        String omitDoctypeDeclaration = getArgValue(args, "omitdoctypedecl", "");
        String omitHtmlEnvelope = getArgValue(args, "omithtmlenvelope", "");
        String useEmptyElementTags = getArgValue(args, "useemptyelementtags", "");
        String allowMultiWordAttributes = getArgValue(args, "allowmultiwordattributes", "");
        String allowHtmlInsideAttributes = getArgValue(args, "allowhtmlinsideattributes", "");
        String ignoreQuestAndExclam = getArgValue(args, "ignoreqe", "");
        String namespacesAware = getArgValue(args, "namespacesaware", "");
        String commentHyphen = getArgValue(args, "hyphenreplacement", "");
        String pruneTags = getArgValue(args, "prunetags", "");
        String booleanAtts = getArgValue(args, "booleanatts", "");
        // The usage text documents "allowinvalidxmlattributenames"; the spelling without
        // "xml" is the one this tool historically read, so it is still accepted.
        String allowInvalidAttributeNames = getArgValue(args, "allowinvalidxmlattributenames",
                getArgValue(args, "allowinvalidattributenames", ""));
        String invalidXmlAttributeNamePrefix = getArgValue(args, "invalidxmlattributenameprefix", "");

        if ( !"".equals(omitUnknownTags) ) {
            props.setOmitUnknownTags( toBoolean(omitUnknownTags) );
        }
        if ( !"".equals(treatUnknownTagsAsContent) ) {
            props.setTreatUnknownTagsAsContent( toBoolean(treatUnknownTagsAsContent) );
        }
        if ( !"".equals(omitDeprecatedTags) ) {
            props.setOmitDeprecatedTags( toBoolean(omitDeprecatedTags) );
        }
        if ( !"".equals(treatDeprecatedTagsAsContent) ) {
            props.setTreatDeprecatedTagsAsContent( toBoolean(treatDeprecatedTagsAsContent) );
        }
        if ( !"".equals(advancedXmlEscape) ) {
            props.setAdvancedXmlEscape( toBoolean(advancedXmlEscape) );
        }
        // "usecdata" is a prefix of "usecdatafor", so it is only honoured when no
        // explicit tag list was given.
        if ( !"".equals(useCData) && "".equals(useCDataFor) ) {
            props.setUseCdataForScriptAndStyle( toBoolean(useCData) );
        }
        if ( !"".equals(useCDataFor) ) {
            props.setUseCdataFor( useCDataFor );
        }
        if ( !"".equals(translateSpecialEntities) ) {
            props.setTranslateSpecialEntities( toBoolean(translateSpecialEntities) );
        }
        if ( !"".equals(unicodeChars) ) {
            props.setRecognizeUnicodeChars( toBoolean(unicodeChars) );
        }
        if ( !"".equals(omitComments) ) {
            props.setOmitComments( toBoolean(omitComments) );
        }
        if ( !"".equals(omitXmlDeclaration) ) {
            props.setOmitXmlDeclaration( toBoolean(omitXmlDeclaration) );
        }
        if ( !"".equals(omitDoctypeDeclaration) ) {
            props.setOmitDoctypeDeclaration( toBoolean(omitDoctypeDeclaration) );
        }
        if ( !"".equals(omitHtmlEnvelope) ) {
            props.setOmitHtmlEnvelope( toBoolean(omitHtmlEnvelope) );
        }
        if ( !"".equals(useEmptyElementTags) ) {
            props.setUseEmptyElementTags( toBoolean(useEmptyElementTags) );
        }
        if ( !"".equals(allowMultiWordAttributes) ) {
            props.setAllowMultiWordAttributes( toBoolean(allowMultiWordAttributes) );
        }
        if ( !"".equals(allowHtmlInsideAttributes) ) {
            props.setAllowHtmlInsideAttributes( toBoolean(allowHtmlInsideAttributes) );
        }
        if ( !"".equals(ignoreQuestAndExclam) ) {
            props.setIgnoreQuestAndExclam( toBoolean(ignoreQuestAndExclam) );
        }
        if ( !"".equals(namespacesAware) ) {
            props.setNamespacesAware( toBoolean(namespacesAware) );
        }
        if ( !"".equals(commentHyphen) ) {
            props.setHyphenReplacementInComment(commentHyphen);
        }
        if ( !"".equals(pruneTags) ) {
            props.setPruneTags(pruneTags);
        }
        if ( !"".equals(booleanAtts) ) {
            props.setBooleanAttributeValues(booleanAtts);
        }
        if ( !"".equals(allowInvalidAttributeNames) ) {
            props.setAllowInvalidAttributeNames( toBoolean(allowInvalidAttributeNames) );
        }
        if ( !"".equals(invalidXmlAttributeNamePrefix) ) {
            props.setInvalidXmlAttributeNamePrefix( invalidXmlAttributeNamePrefix );
        }
    }

    /**
     * Collects the {@code t:key[=value]} arguments into a sorted map with the
     * {@code t:} prefix removed. An argument without a value maps its key to null.
     *
     * @param args the command line arguments
     * @return transformation descriptions keyed by transformation key
     */
    private static Map<String, String> collectTransformations(String[] args) {
        Map<String, String> transInfos = new TreeMap<String, String>();
        for (String rawArg : args) {
            if (rawArg.startsWith("t:") && rawArg.length() > 2) {
                String arg = rawArg.substring(2);
                int index = arg.indexOf('=');
                String key = index <= 0 ? arg : arg.substring(0, index);
                String value = index <= 0 ? null : arg.substring(index + 1);
                transInfos.put(key, value);
            }
        }
        return transInfos;
    }

    /**
     * Serializes the node with the serializer selected by {@code outputType}; unknown
     * or empty types use the simple XML serializer.
     *
     * @param props      properties handed to the serializer
     * @param node       the node to serialize
     * @param out        destination stream; it is not closed here
     * @param outCharset charset name for the output
     * @param outputType value of the {@code outputtype} option
     * @throws IOException if writing fails
     */
    private static void writeNode(CleanerProperties props, TagNode node, OutputStream out,
                                  String outCharset, String outputType) throws IOException {
        if ( "compact".equals(outputType) ) {
            new CompactXmlSerializer(props).writeToStream(node, out, outCharset);
        } else if ( "browser-compact".equals(outputType) ) {
            new BrowserCompactXmlSerializer(props).writeToStream(node, out, outCharset);
        } else if ( "pretty".equals(outputType) ) {
            new PrettyXmlSerializer(props).writeToStream(node, out, outCharset);
        } else if ( "htmlsimple".equals(outputType) ) {
            new SimpleHtmlSerializer(props).writeToStream(node, out, outCharset);
        } else if ( "htmlpretty".equals(outputType) ) {
            new PrettyHtmlSerializer(props).writeToStream(node, out, outCharset);
        } else if ( "htmlcompact".equals(outputType) ) {
            new CompactHtmlSerializer(props).writeToStream(node, out, outCharset);
        } else {
            new SimpleXmlSerializer(props).writeToStream(node, out, outCharset);
        }
    }

    /**
     * Cleans the HTML named by {@code src} (a URL or a file), or read from standard
     * input, and writes the serialized result to {@code dest} or to standard output.
     *
     * @param args the command line arguments; run without arguments and without input
     *             to see the usage text
     * @throws IOException      if reading the input or writing the output fails
     * @throws XPatherException if the {@code nodebyxpath} expression cannot be evaluated
     */
    public static void main(String[] args) throws IOException, XPatherException {
        String source = getArgValue(args, "src", "");
        Scanner scan = new Scanner(System.in);
        String s = "";
        if ( "".equals(source) ) {
            s = readStandardInput(scan);
            if (s.trim().length() > 0) {
                System.err.println("Output:");
            } else {
                printUsage();
                System.exit(1);
            }
        }

        String inCharset = getArgValue(args, "incharset", "");
        if ("".equals(inCharset)) {
            inCharset = CleanerProperties.DEFAULT_CHARSET;
        }
        String outCharset = getArgValue(args, "outcharset", "");
        if ("".equals(outCharset)) {
            outCharset = CleanerProperties.DEFAULT_CHARSET;
        }
        String htmlversion = getArgValue(args, "htmlver", "");
        String destination = getArgValue(args, "dest", "");
        String outputType = getArgValue(args, "outputtype", "");
        String nodeByXPath = getArgValue(args, "nodebyxpath", "");

        HtmlCleaner cleaner;
        String tagInfoFile = getArgValue(args, "taginfofile", "");
        if ( !"".equals(tagInfoFile) ) {
            cleaner = new HtmlCleaner(new ConfigFileTagProvider(new File(tagInfoFile)));
        } else if ("4".equals(htmlversion)) {
            cleaner = new HtmlCleaner(Html4TagProvider.INSTANCE);
        } else {
            cleaner = new HtmlCleaner(Html5TagProvider.INSTANCE);
        }

        final CleanerProperties props = cleaner.getProperties();

        //
        // If the user specifies "quiet" or "quiet=true" then we don't add a modification
        // listener
        //
        final boolean quiet = getSwitchArgument(args, "quiet");
        if (!quiet) {
            props.addHtmlModificationListener(new HtmlModificationListenerLogger(logger));
        }

        configureProperties(args, props);

        // collect transformation info
        cleaner.initCleanerTransformations(collectTransformations(args));

        long start = System.currentTimeMillis();

        TagNode node;
        // Lower-case only to detect the scheme; URL paths can be case-sensitive, so the
        // URL itself is built from the text exactly as given.
        String src = source.toLowerCase();
        if (src.startsWith("http://") || src.startsWith("https://")) {
            node = cleaner.clean(new URL(source), inCharset);
        } else if (!source.isEmpty()) {
            node = cleaner.clean(new File(source), inCharset);
        } else {
            node = cleaner.clean(s);
        }

        // if user specifies XPath expression to choose node for serialization, then
        // try to evaluate XPath and look for first TagNode instance in the resulting array
        if ( !"".equals(nodeByXPath) ) {
            final Object[] xpathResult = node.evaluateXPath(nodeByXPath);
            int i;
            for (i = 0; i < xpathResult.length; i++) {
                if ( xpathResult[i] instanceof TagNode ) {
                    node = (TagNode) xpathResult[i];
                    System.out.println("Node successfully found by XPath.");
                    break;
                }
            }
            if (i == xpathResult.length) {
                System.out.println("Node not found by XPath expression - whole html tree is going to be serialized!");
            }
        }

        final boolean writeToFile = destination != null && !"".equals(destination.trim());
        OutputStream out;
        if (writeToFile) {
            out = new FileOutputStream(destination);
        } else {
            out = System.out;
        }
        try {
            writeNode(props, node, out, outCharset, outputType);
        } finally {
            // Close the file we opened, even on failure; System.out is left open.
            if (writeToFile) {
                out.close();
            }
        }

        if (!quiet) {
            System.out.println("Finished successfully in " + (System.currentTimeMillis() - start)+ "ms." );
        }
        scan.close();
    }
}
@@ -0,0 +1,71 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
/**
 * <p>Token representing an HTML comment.</p>
 *
 * <p>Only the text between the comment delimiters is stored; the delimiters are
 * added back whenever the node is rendered.</p>
 */
public class CommentNode extends BaseHtmlNode implements HtmlNode {

    /** Delimiter written in front of the comment text. */
    private static final String COMMENT_OPEN = "<!--";

    /** Delimiter written after the comment text. */
    private static final String COMMENT_CLOSE = "-->";

    /** Raw comment text, without the surrounding delimiters. */
    private String content;

    /**
     * @param content comment text without the surrounding delimiters
     */
    public CommentNode(String content) {
        this.content = content;
    }

    /**
     * @return the comment text exactly as passed to the constructor
     */
    public String getContent() {
        return content;
    }

    /**
     * @return the comment text wrapped in the comment delimiters
     */
    public String getCommentedContent() {
        return COMMENT_OPEN + content + COMMENT_CLOSE;
    }

    /**
     * Writes the delimited comment to the given writer; the serializer argument is not used.
     *
     * @param serializer serializer driving the output (unused here)
     * @param writer destination of the rendered comment
     * @throws IOException if the writer fails
     */
    public void serialize(Serializer serializer, Writer writer) throws IOException {
        final String rendered = getCommentedContent();
        writer.write(rendered);
    }

    /**
     * @return same text as {@link #getCommentedContent()}
     */
    @Override
    public String toString() {
        return getCommentedContent();
    }
}
@@ -0,0 +1,111 @@
/* Copyright (c) 2006-2013, HtmlCleaner project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.*;
import java.util.*;
/**
 * <p>Compact HTML serializer - creates resulting HTML by stripping whitespaces wherever possible.</p>
 *
 * <p>Text that is nested (at any depth) inside a <code>pre</code> element is written
 * unchanged. All other text is trimmed, and a run of leading or trailing whitespace is
 * reduced to a single space.</p>
 */
public class CompactHtmlSerializer extends HtmlSerializer {

    /** Number of <code>pre</code> elements currently being serialized (nesting depth). */
    private int openPreTags = 0;

    public CompactHtmlSerializer(CleanerProperties props) {
        super(props);
    }

    /**
     * Serializes the given tag and all of its children in compact form.
     *
     * @param tagNode tag to serialize
     * @param writer destination of the generated markup
     * @throws IOException if the writer fails
     */
    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
        boolean isPreTag = "pre".equalsIgnoreCase(tagNode.getName());
        if (isPreTag) {
            openPreTags++;
        }
        try {
            serializeOpenTag(tagNode, writer, false);
            List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
            if ( isMinimizedTagSyntax(tagNode) ) {
                // minimized tags have neither children nor an end tag
                return;
            }
            for (Object item : tagChildren) {
                if (item instanceof ContentNode) {
                    writeContent(item.toString(), tagNode, writer);
                } else if (item instanceof CommentNode) {
                    String comment = ((CommentNode) item).getCommentedContent().trim();
                    writer.write(comment);
                } else if (item instanceof BaseToken) {
                    ((BaseToken) item).serialize(this, writer);
                }
            }
            serializeEndTag(tagNode, writer, false);
        } finally {
            // Always undo the increment above. Previously the counter was only decremented
            // after a full (non-minimized) serialization, so a minimized <pre> or an
            // IOException left it raised and every later text node on this serializer
            // instance was treated as preformatted.
            if (isPreTag) {
                openPreTags--;
            }
        }
    }

    /**
     * Writes a single text node, either verbatim (inside <code>pre</code>) or compacted.
     *
     * @param content raw text of the node
     * @param tagNode parent tag, consulted to decide whether the text must be escaped
     * @param writer destination of the generated markup
     * @throws IOException if the writer fails
     */
    private void writeContent(String content, TagNode tagNode, Writer writer) throws IOException {
        if (openPreTags > 0) {
            writer.write(content);
            return;
        }
        boolean startsWithSpace = content.length() > 0 && Character.isWhitespace( content.charAt(0) );
        // length > 1: whitespace-only text of a single character must yield one space, not two
        boolean endsWithSpace = content.length() > 1 && Character.isWhitespace( content.charAt(content.length() - 1) );
        String compacted = dontEscape(tagNode) ? content.trim() : escapeText(content.trim());
        if (startsWithSpace) {
            writer.write(' ');
        }
        if (compacted.length() != 0) {
            writer.write(compacted);
            if (endsWithSpace) {
                writer.write(' ');
            }
        }
        // A newline used to be written here when the following sibling was not blank;
        // that was removed due to issue #199.
    }
}
@@ -0,0 +1,98 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.*;
/**
 * <p>Compact XML serializer - produces XML with surrounding whitespace of text nodes removed.</p>
 */
public class CompactXmlSerializer extends XmlSerializer {

    public CompactXmlSerializer(CleanerProperties props) {
        super(props);
    }

    /**
     * Writes the tag, its children and its end tag. Text children are trimmed, and a line
     * break is written after a text child whenever the child that follows it does not
     * render as blank text.
     *
     * @param tagNode tag to serialize
     * @param writer destination of the generated markup
     * @throws IOException if the writer fails
     */
    @Override
    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
        serializeOpenTag(tagNode, writer, false);
        List<? extends BaseToken> children = tagNode.getAllChildren();
        if ( isMinimizedTagSyntax(tagNode) ) {
            return;
        }

        ListIterator<? extends BaseToken> cursor = children.listIterator();
        while ( cursor.hasNext() ) {
            Object child = cursor.next();
            if (child == null) {
                continue;
            }
            if (child instanceof ContentNode) {
                writeText((ContentNode) child, tagNode, writer);
                if (cursor.hasNext()) {
                    // peek at the following sibling, then step back so the loop visits it normally
                    if ( !isWhitespaceString(cursor.next()) ) {
                        writer.write("\n");
                    }
                    cursor.previous();
                }
            } else if (child instanceof CommentNode) {
                CommentNode comment = (CommentNode) child;
                writer.write( comment.getCommentedContent().trim() );
            } else {
                BaseToken token = (BaseToken) child;
                token.serialize(this, writer);
            }
        }
        serializeEndTag(tagNode, writer, false);
    }

    /**
     * Writes the trimmed text of a content node. Text of tags that must not be escaped only
     * gets its CDATA terminator neutralised; all other text is XML-escaped.
     */
    private void writeText(ContentNode textNode, TagNode parent, Writer writer) throws IOException {
        String trimmed = textNode.getContent().trim();
        String output;
        if ( dontEscape(parent) ) {
            output = trimmed.replaceAll("]]>", "]]&gt;");
        } else {
            output = escapeXml(trimmed);
        }
        writer.write(output);
    }

    /**
     * Tells whether the string form of an object consists of whitespace only (or is empty).
     *
     * @param object object whose {@code toString()} result is inspected
     * @return true for blank text; false for null objects, null strings and non-blank text
     */
    private boolean isWhitespaceString(Object object) {
        if (object == null) {
            return false;
        }
        String text = object.toString();
        if (text == null) {
            return false;
        }
        return text.trim().length() == 0;
    }
}
@@ -0,0 +1,257 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.net.URL;
/**
 * Configuration file tag provider - reads XML file in specified format and creates a Tag Provider.
 * Used to create custom tag providers when used on the command line.
 *
 * <p>The provider is itself the map from tag name to {@link TagInfo}: every <code>tag</code>
 * element of the configuration file is stored under its name when the element ends.</p>
 */
public class ConfigFileTagProvider extends HashMap implements ITagInfoProvider {

    // obtaining instance of the SAX parser factory
    // NOTE(review): DTD and external-entity processing are left at the parser defaults. If
    // configuration files may come from untrusted sources, consider disabling them here.
    static SAXParserFactory parserFactory = SAXParserFactory.newInstance();

    static {
        parserFactory.setValidating(false);
        parserFactory.setNamespaceAware(false);
    }

    // tells whether to generate code of the tag provider class based on XML configuration file
    // to the standard output
    private boolean generateCode = false;

    /** Used by {@link #main(String[])} only, which configures and runs the parser itself. */
    private ConfigFileTagProvider() {
    }

    /**
     * Builds the provider from an already prepared SAX input source.
     *
     * @param inputSource source of the XML configuration; not closed by this constructor
     * @throws HtmlCleanerException if the configuration cannot be read or parsed
     */
    public ConfigFileTagProvider(InputSource inputSource) {
        try {
            new ConfigParser(this).parse(inputSource);
        } catch (Exception e) {
            throw new HtmlCleanerException("Error parsing tag configuration file!", e);
        }
    }

    /**
     * Builds the provider from a configuration file.
     *
     * @param file XML configuration file
     * @throws HtmlCleanerException if the file cannot be read or parsed
     */
    public ConfigFileTagProvider(File file) {
        try {
            parseFile(new ConfigParser(this), file);
        } catch (Exception e) {
            throw new HtmlCleanerException("Error parsing tag configuration file!", e);
        }
    }

    /**
     * Builds the provider from a configuration document behind a URL. When the URL content is
     * not delivered as an {@link InputStream}, nothing is parsed and the provider stays empty.
     *
     * @param url location of the XML configuration
     * @throws HtmlCleanerException if the content cannot be read or parsed
     */
    public ConfigFileTagProvider(URL url) {
        try {
            Object content = url.getContent();
            if (content instanceof InputStream) {
                InputStream stream = (InputStream) content;
                try {
                    // byte stream (not a Reader) so that the parser honours the XML encoding declaration
                    new ConfigParser(this).parse(new InputSource(stream));
                } finally {
                    closeQuietly(stream);
                }
            }
        } catch (Exception e) {
            throw new HtmlCleanerException("Error parsing tag configuration file!", e);
        }
    }

    /**
     * @param tagName name of the tag as it was declared in the configuration
     * @return the registered tag info, or null when no such tag was configured
     */
    public TagInfo getTagInfo(String tagName) {
        return (TagInfo) get(tagName);
    }

    /**
     * Generates code for tag provider class from specified configuration XML file.
     * In order to create custom tag info provider, make config file and call this main method
     * with the specified file. Output will be generated on the standard output. This way a custom
     * tag provider (class CustomTagProvider) is generated from an XML file. An example XML file,
     * "example.xml", can be found in the source distribution.
     *
     * @param args optional; the first element is the configuration file name ("default.xml" if absent)
     * @throws IOException if the configuration file cannot be read
     * @throws SAXException if the configuration file is not well-formed
     * @throws ParserConfigurationException if no SAX parser can be created
     */
    public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
        final ConfigFileTagProvider provider = new ConfigFileTagProvider();
        provider.generateCode = true;

        String fileName = "default.xml";
        if (args != null && args.length > 0) {
            fileName = args[0];
        }
        File configFile = new File(fileName);

        String packagePath = "org.htmlcleaner";
        String className = "CustomTagProvider";

        final ConfigParser parser = provider.new ConfigParser(provider);
        System.out.println("package " + packagePath + ";");
        System.out.println("import java.util.HashMap;");
        // the generated class declares a ConcurrentMap field, so it needs these imports to compile
        System.out.println("import java.util.concurrent.ConcurrentHashMap;");
        System.out.println("import java.util.concurrent.ConcurrentMap;");
        System.out.println("public class " + className + " extends HashMap implements ITagInfoProvider {");
        System.out.println("private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();");
        System.out.println("// singleton instance, used if no other TagInfoProvider is specified");
        System.out.println("public final static "+className+" INSTANCE= new "+className+"();");
        System.out.println("public " + className + "() {");
        System.out.println("TagInfo tagInfo;");
        parseFile(parser, configFile);
        System.out.println("}");
        System.out.println("}");
    }

    /**
     * Parses a configuration file and guarantees that the file handle is released afterwards.
     * The file is handed to the parser as a byte stream so that the encoding declared in the
     * XML prolog is used instead of the platform default charset.
     */
    private static void parseFile(ConfigParser parser, File file)
            throws ParserConfigurationException, SAXException, IOException {
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(new InputSource(stream));
        } finally {
            closeQuietly(stream);
        }
    }

    /** Closes a resource, ignoring close failures so they cannot mask the parsing outcome. */
    private static void closeQuietly(Closeable resource) {
        try {
            resource.close();
        } catch (IOException ignored) {
            // nothing useful can be done about a failed close of a read-only stream
        }
    }

    /**
     * SAX parser for tag configuration files.
     */
    private class ConfigParser extends DefaultHandler {

        // tag currently being read; null outside of a "tag" element
        private TagInfo tagInfo = null;

        // name of the dependency element currently open inside the tag; null if none
        private String dependencyName = null;

        // destination of the parsed tag definitions
        private Map tagInfoMap;

        ConfigParser(Map tagInfoMap) {
            this.tagInfoMap = tagInfoMap;
        }

        /**
         * Runs a SAX parse of the given source with this object as the handler.
         */
        public void parse(InputSource in) throws ParserConfigurationException, SAXException, IOException {
            SAXParser parser = parserFactory.newSAXParser();
            parser.parse(in, this);
        }

        /**
         * Applies the text of the currently open dependency element to the current tag and,
         * in code generation mode, prints the equivalent Java statement.
         */
        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (tagInfo != null) {
                String value = new String(ch, start, length).trim();
                if ( "fatal-tags".equals(dependencyName) ) {
                    tagInfo.defineFatalTags(value);
                    if (generateCode) {
                        System.out.println("tagInfo.defineFatalTags(\"" + value + "\");");
                    }
                } else if ( "req-enclosing-tags".equals(dependencyName) ) {
                    tagInfo.defineRequiredEnclosingTags(value);
                    if (generateCode) {
                        System.out.println("tagInfo.defineRequiredEnclosingTags(\"" + value + "\");");
                    }
                } else if ( "forbidden-tags".equals(dependencyName) ) {
                    tagInfo.defineForbiddenTags(value);
                    if (generateCode) {
                        System.out.println("tagInfo.defineForbiddenTags(\"" + value + "\");");
                    }
                } else if ( "allowed-children-tags".equals(dependencyName) ) {
                    tagInfo.defineAllowedChildrenTags(value);
                    if (generateCode) {
                        System.out.println("tagInfo.defineAllowedChildrenTags(\"" + value + "\");");
                    }
                } else if ( "higher-level-tags".equals(dependencyName) ) {
                    tagInfo.defineHigherLevelTags(value);
                    if (generateCode) {
                        System.out.println("tagInfo.defineHigherLevelTags(\"" + value + "\");");
                    }
                } else if ( "close-before-copy-inside-tags".equals(dependencyName) ) {
                    tagInfo.defineCloseBeforeCopyInsideTags(value);
                    if (generateCode) {
                        System.out.println("tagInfo.defineCloseBeforeCopyInsideTags(\"" + value + "\");");
                    }
                } else if ( "close-inside-copy-after-tags".equals(dependencyName) ) {
                    tagInfo.defineCloseInsideCopyAfterTags(value);
                    if (generateCode) {
                        System.out.println("tagInfo.defineCloseInsideCopyAfterTags(\"" + value + "\");");
                    }
                } else if ( "close-before-tags".equals(dependencyName) ) {
                    tagInfo.defineCloseBeforeTags(value);
                    if (generateCode) {
                        System.out.println("tagInfo.defineCloseBeforeTags(\"" + value + "\");");
                    }
                }
            }
        }

        /**
         * Starts a new tag definition on a "tag" element; any element other than "tag" and
         * "tags" is remembered as the currently open dependency element.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            if ( "tag".equals(qName) ) {
                String name = attributes.getValue("name");
                String content = attributes.getValue("content");
                String section = attributes.getValue("section");
                // a flag is set only when the attribute is literally "true"
                boolean deprecated = "true".equals(attributes.getValue("deprecated"));
                boolean unique = "true".equals(attributes.getValue("unique"));
                boolean ignorePermitted = "true".equals(attributes.getValue("ignore-permitted"));
                ContentType contentType = ContentType.toValue(content);
                BelongsTo belongsTo = BelongsTo.toValue(section);
                tagInfo = new TagInfo(name, contentType,
                        belongsTo,
                        deprecated,
                        unique,
                        ignorePermitted, CloseTag.required, Display.any );
                if (generateCode) {
                    // NOTE(review): contentType is null for a missing or unrecognised "content"
                    // attribute (presumably the same holds for belongsTo); name() then throws.
                    // Literal replace(): the values must not be interpreted as regex replacements,
                    // otherwise '$' or '\' in a tag name is misread.
                    String s = "tagInfo = new TagInfo(\"#1\", #2, #3, #4, #5, #6);";
                    s = s.replace("#1", name);
                    s = s.replace("#2", ContentType.class.getCanonicalName()+"."+contentType.name());
                    s = s.replace("#3", BelongsTo.class.getCanonicalName()+"."+belongsTo.name());
                    s = s.replace("#4", Boolean.toString(deprecated));
                    s = s.replace("#5", Boolean.toString(unique));
                    s = s.replace("#6", Boolean.toString(ignorePermitted));
                    System.out.println(s);
                }
            } else if ( !"tags".equals(qName) ) {
                dependencyName = qName;
            }
        }

        /**
         * Registers the finished tag definition on the end of a "tag" element and forgets the
         * open dependency element on the end of any element other than "tag" and "tags".
         */
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if ( "tag".equals(qName) ) {
                if (tagInfo != null) {
                    tagInfoMap.put(tagInfo.getName(), tagInfo);
                    if (generateCode) {
                        System.out.println("this.put(\"" + tagInfo.getName() + "\", tagInfo);\n");
                    }
                }
                tagInfo = null;
            } else if ( !"tags".equals(qName) ) {
                dependencyName = null;
            }
        }
    }
}
@@ -0,0 +1,72 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
/**
 * <p>Token holding a piece of HTML text.</p>
 *
 * <p>The text and its blank flag are fixed at construction time.</p>
 */
public class ContentNode extends BaseHtmlNode implements HtmlNode {

    /** Text carried by this node, exactly as supplied to the constructor. */
    protected final String content;

    /** Result of {@code Utils.isEmptyString} for {@link #content}, computed once. */
    protected final boolean blank;

    /**
     * @param content text of the node
     */
    public ContentNode(String content) {
        this.content = content;
        this.blank = Utils.isEmptyString(content);
    }

    /**
     * @return the flag computed by {@code Utils.isEmptyString} when the node was created
     */
    public boolean isBlank() {
        return blank;
    }

    /**
     * @return the text of this node
     */
    public String getContent() {
        return this.content;
    }

    /**
     * Writes the node text as-is to the writer; the serializer argument is not used.
     *
     * @param serializer serializer driving the output (unused here)
     * @param writer destination of the text
     * @throws IOException if the writer fails
     */
    public void serialize(Serializer serializer, Writer writer) throws IOException {
        final String text = getContent();
        writer.write(text);
    }

    /**
     * @return same value as {@link #getContent()}
     */
    @Override
    public String toString() {
        return getContent();
    }
}
@@ -0,0 +1,76 @@
/*
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
 * Kinds of content a tag may have.
 *
 * @author patmoore
 */
public enum ContentType {

    all("all"),

    /**
     * Elements that have no children or content (for example {@code <img>}). For these
     * elements, the check for null elements must be more than just a children/content check.
     */
    none("none"),

    text("text");

    /** Textual code of the constant, accepted by {@link #toValue(Object)}. */
    private final String dbCode;

    private ContentType(String dbCode) {
        this.dbCode = dbCode;
    }

    /**
     * @return the dbCode
     */
    public String getDbCode() {
        return dbCode;
    }

    /**
     * Converts an arbitrary value to a content type.
     *
     * @param value a {@code ContentType} (returned unchanged), or any object whose trimmed
     *        string form equals, ignoring case, the code or the name of a constant
     * @return the matching constant, or null when {@code value} is null or matches nothing
     */
    public static ContentType toValue(Object value) {
        if (value instanceof ContentType) {
            return (ContentType) value;
        }
        if (value == null) {
            return null;
        }
        final String wanted = value.toString().trim();
        for (ContentType candidate : values()) {
            boolean sameCode = candidate.getDbCode().equalsIgnoreCase(wanted);
            if (sameCode || candidate.name().equalsIgnoreCase(wanted)) {
                return candidate;
            }
        }
        return null;
    }
}
@@ -0,0 +1,645 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
/**
* This is the default tag provider for HTML Cleaner
* Note this is no longer generated from XML - see https://sourceforge.net/p/htmlcleaner/bugs/81/
*/
public class DefaultTagProvider implements ITagInfoProvider {
private static final String STRONG = "strong";
private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
// singleton instance, used if no other TagInfoProvider is specified
public final static DefaultTagProvider INSTANCE= new DefaultTagProvider();
private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo,"+STRONG+",em,q,b,i,u,tt,sub,sup,big,small,strike,s,font";
private static final String CLOSE_BEFORE_TAGS = "h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
/**
* Phrasing tags are those that can make up paragraphs along with text to make Phrasing Content
*/
private static final String PHRASING_TAGS = "a,abbr,area,audio,b,bdi,bdo,br,button,canvas,cite,code,data,datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,keygen,label,link,map,mark,math,meta,meter,noscript,object,output,progress,q,ruby,s,samp,script,select,small,span,strong,sub,sup,svg,template,textarea,time,u,var,video,wbr";
/**
* HTML5 Media Tags
*/
private static final String MEDIA_TAGS = "audio,video";
public DefaultTagProvider() {
TagInfo tagInfo;
// private static final Set<String> END_TAG_OPTIONAL = Collections.unmodifiableSet(new HashSet(Arrays.asList(
// "thead", "dt", "body", "tr", "colgroup", "td", "tfoot", "th", "li", "dd", "tbody", "p", "html", "head", "option")));
// private static final Set<String> END_TAG_FORBIDDEN = Collections.unmodifiableSet(new HashSet(Arrays.asList(
// "hr", "col", "param", "link", "img", "br", "meta", "input", "frame", "area", "basefont", "base", "isindex")));
// private static final Set<String> END_TAG_REQUIRED = Collections.unmodifiableSet(new HashSet(Arrays.asList(
// "noscript", "kbd", "center", "button", "h5", "h4", "samp", "ol", "h6", "h1", "h3", "h2", "form", "select",
// "font", "menu", "ins",
// "abbr", "label", "table", "code", "script", "cite", "iframe", "strong", "textarea", "noframes", "big",
// "small", "span", "sub", "optgroup", "bdo", "var", "div", "object", "sup", "title", "strike", "style",
// "dir", "map", "applet", "dl", "del", "fieldset", "ul", "b", "acronym", "a", "blockquote",
// "caption", "i", "u", "s", "frameset", "tt", "address", "q", "pre", "legend", "em", "dfn")));
tagInfo = new TagInfo("div", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("div", tagInfo);
/**
* The HTML5 semantic flow tags
*/
// Sectioning tags
tagInfo = new TagInfo("aside", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p");
this.put("aside", tagInfo);
tagInfo = new TagInfo("section", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p");
this.put("section", tagInfo);
tagInfo = new TagInfo("article", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p");
this.put("article", tagInfo);
tagInfo = new TagInfo("main", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p");
this.put("main", tagInfo);
tagInfo = new TagInfo("nav", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p");
this.put("nav", tagInfo);
tagInfo = new TagInfo("details", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p");
this.put("details", tagInfo);
tagInfo = new TagInfo("summary", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineRequiredEnclosingTags("details");
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p");
this.put("summary", tagInfo);
tagInfo = new TagInfo("figure", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p");
this.put("figure", tagInfo);
tagInfo = new TagInfo("figcaption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
tagInfo.defineRequiredEnclosingTags("figure");
this.put("figcaption", tagInfo);
// header and footer
tagInfo = new TagInfo("header", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,header,footer,main");
this.put("header", tagInfo);
tagInfo = new TagInfo("footer", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,header,footer,main");
this.put("footer", tagInfo);
/**
* Html5 phrasing tags
*/
tagInfo = new TagInfo("mark", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("mark", tagInfo);
tagInfo = new TagInfo("bdi", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("bdi", tagInfo);
tagInfo = new TagInfo("time", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("time", tagInfo);
tagInfo = new TagInfo("meter", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
tagInfo.defineCloseBeforeTags("meter");
this.put("meter", tagInfo);
/**
* Html5 Ruby text
*/
tagInfo = new TagInfo("ruby", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags("rt,rp");
this.put("ruby", tagInfo);
tagInfo = new TagInfo("rt", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.inline);
//
// If we include this rule, we get an out-of-memory error. See issue 126.
//
//tagInfo.defineRequiredEnclosingTags("ruby");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("rt", tagInfo);
tagInfo = new TagInfo("rp", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.inline);
//
// If we include this rule, we get an out-of-memory error. See issue 126.
//
//tagInfo.defineRequiredEnclosingTags("ruby");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("rp", tagInfo);
/**
* Html5 media tags
*/
tagInfo = new TagInfo("audio", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
this.put("audio", tagInfo);
tagInfo = new TagInfo("video", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
this.put("video", tagInfo);
tagInfo = new TagInfo("source", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.any);
tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
this.put("source", tagInfo);
tagInfo = new TagInfo("track", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.any);
tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
this.put("track", tagInfo);
tagInfo = new TagInfo("canvas", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
this.put("canvas", tagInfo);
/**
* Html5 interactive tags
*/
tagInfo = new TagInfo("dialog", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
this.put("dialog", tagInfo);
tagInfo = new TagInfo("progress", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
tagInfo.defineCloseBeforeTags("progress");
this.put("progress", tagInfo);
/**
* HTML 4 and earlier tags
*/
tagInfo = new TagInfo("span", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("span", tagInfo);
tagInfo = new TagInfo("meta", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
this.put("meta", tagInfo);
tagInfo = new TagInfo("link", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
this.put("link", tagInfo);
tagInfo = new TagInfo("title", ContentType.text, BelongsTo.HEAD, false, true, false, CloseTag.required, Display.none);
this.put("title", tagInfo);
tagInfo = new TagInfo("style", ContentType.text, BelongsTo.HEAD, false, false, false, CloseTag.required, Display.none);
this.put("style", tagInfo);
tagInfo = new TagInfo("bgsound", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
this.put("bgsound", tagInfo);
tagInfo = new TagInfo("h1", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("h1", tagInfo);
tagInfo = new TagInfo("h2", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("h2", tagInfo);
tagInfo = new TagInfo("h3", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("h3", tagInfo);
tagInfo = new TagInfo("h4", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("h4", tagInfo);
tagInfo = new TagInfo("h5", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("h5", tagInfo);
tagInfo = new TagInfo("h6", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("h6", tagInfo);
// jericho parser requires <p></p>
tagInfo = new TagInfo("p", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("p", tagInfo);
tagInfo = new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put(STRONG, tagInfo);
tagInfo = new TagInfo("em", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("em", tagInfo);
tagInfo = new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("abbr", tagInfo);
tagInfo = new TagInfo("acronym", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("acronym", tagInfo);
tagInfo = new TagInfo("address", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("address", tagInfo);
tagInfo = new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("bdo", tagInfo);
tagInfo = new TagInfo("blockquote", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("blockquote", tagInfo);
tagInfo = new TagInfo("cite", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("cite", tagInfo);
tagInfo = new TagInfo("q", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("q", tagInfo);
tagInfo = new TagInfo("code", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("code", tagInfo);
tagInfo = new TagInfo("ins", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
this.put("ins", tagInfo);
tagInfo = new TagInfo("del", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
this.put("del", tagInfo);
tagInfo = new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("dfn", tagInfo);
tagInfo = new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("kbd", tagInfo);
tagInfo = new TagInfo("pre", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("pre", tagInfo);
tagInfo = new TagInfo("samp", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("samp", tagInfo);
tagInfo = new TagInfo("listing", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("listing", tagInfo);
tagInfo = new TagInfo("var", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("var", tagInfo);
tagInfo = new TagInfo("br", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
this.put("br", tagInfo);
tagInfo = new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
this.put("wbr", tagInfo);
tagInfo = new TagInfo("nobr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseBeforeTags("nobr");
this.put("nobr", tagInfo);
tagInfo = new TagInfo("xmp", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("xmp", tagInfo);
tagInfo = new TagInfo("a", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseBeforeTags("a");
this.put("a", tagInfo);
tagInfo = new TagInfo("base", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
this.put("base", tagInfo);
tagInfo = new TagInfo("img", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
this.put("img", tagInfo);
tagInfo = new TagInfo("area", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
tagInfo.defineFatalTags("map");
tagInfo.defineCloseBeforeTags("area");
this.put("area", tagInfo);
tagInfo = new TagInfo("map", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
tagInfo.defineCloseBeforeTags("map");
this.put("map", tagInfo);
tagInfo = new TagInfo("object", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
this.put("object", tagInfo);
tagInfo = new TagInfo("param", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("param", tagInfo);
tagInfo = new TagInfo("applet", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.any);
this.put("applet", tagInfo);
tagInfo = new TagInfo("xml", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
this.put("xml", tagInfo);
tagInfo = new TagInfo("ul", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("ul", tagInfo);
tagInfo = new TagInfo("ol", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("ol", tagInfo);
tagInfo = new TagInfo("li", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("li,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("li", tagInfo);
tagInfo = new TagInfo("dl", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("dl", tagInfo);
tagInfo = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineCloseBeforeTags("dt,dd");
this.put("dt", tagInfo);
tagInfo = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineCloseBeforeTags("dt,dd");
this.put("dd", tagInfo);
tagInfo = new TagInfo("menu", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("menu", tagInfo);
tagInfo = new TagInfo("dir", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("dir", tagInfo);
tagInfo = new TagInfo("table", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,colgroup,caption");
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("table", tagInfo);
tagInfo = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineRequiredEnclosingTags("tbody");
tagInfo.defineAllowedChildrenTags("td,th");
tagInfo.defineHigherLevelTags("thead,tfoot");
tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup");
this.put("tr", tagInfo);
// jericho parser requires <td></td>
tagInfo = new TagInfo("td", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineRequiredEnclosingTags("tr");
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
this.put("td", tagInfo);
tagInfo = new TagInfo("th", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineRequiredEnclosingTags("tr");
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
this.put("th", tagInfo);
tagInfo = new TagInfo("tbody", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineAllowedChildrenTags("tr,form");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("tbody", tagInfo);
tagInfo = new TagInfo("thead", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineAllowedChildrenTags("tr,form");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("thead", tagInfo);
tagInfo = new TagInfo("tfoot", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineAllowedChildrenTags("tr,form");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("tfoot", tagInfo);
tagInfo = new TagInfo("col", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
tagInfo.defineFatalTags("colgroup");
this.put("col", tagInfo);
tagInfo = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineAllowedChildrenTags("col");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("colgroup", tagInfo);
tagInfo = new TagInfo("caption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineFatalTags("table");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("caption", tagInfo);
tagInfo = new TagInfo("form", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.block);
tagInfo.defineForbiddenTags("form");
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("form", tagInfo);
tagInfo = new TagInfo("input", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
tagInfo.defineCloseBeforeTags("select,optgroup,option");
this.put("input", tagInfo);
tagInfo = new TagInfo("textarea", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseBeforeTags("select,optgroup,option");
this.put("textarea", tagInfo);
tagInfo = new TagInfo("select", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags("option,optgroup");
tagInfo.defineCloseBeforeTags("option,optgroup,select");
this.put("select", tagInfo);
tagInfo = new TagInfo("option", ContentType.text, BelongsTo.BODY, false, false, true, CloseTag.optional, Display.inline);
tagInfo.defineFatalTags("select");
tagInfo.defineCloseBeforeTags("option");
this.put("option", tagInfo);
tagInfo = new TagInfo("optgroup", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
tagInfo.defineFatalTags("select");
tagInfo.defineAllowedChildrenTags("option");
tagInfo.defineCloseBeforeTags("optgroup");
this.put("optgroup", tagInfo);
tagInfo = new TagInfo("button", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
tagInfo.defineCloseBeforeTags("select,optgroup,option");
this.put("button", tagInfo);
tagInfo = new TagInfo("label", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
this.put("label", tagInfo);
tagInfo = new TagInfo("legend", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
//
// If we include this rule, we get an out-of-memory error. See issue 129.
//
//tagInfo.defineRequiredEnclosingTags("fieldset");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("legend", tagInfo);
tagInfo = new TagInfo("fieldset", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("fieldset", tagInfo);
tagInfo = new TagInfo("isindex", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("isindex", tagInfo);
tagInfo = new TagInfo("script", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.none);
this.put("script", tagInfo);
tagInfo = new TagInfo("noscript", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.block);
this.put("noscript", tagInfo);
tagInfo = new TagInfo("b", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("u,i,tt,sub,sup,big,small,strike,blink,s");
this.put("b", tagInfo);
tagInfo = new TagInfo("i", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,tt,sub,sup,big,small,strike,blink,s");
this.put("i", tagInfo);
tagInfo = new TagInfo("u", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,i,tt,sub,sup,big,small,strike,blink,s");
this.put("u", tagInfo);
tagInfo = new TagInfo("tt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,big,small,strike,blink,s");
this.put("tt", tagInfo);
tagInfo = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sup,big,small,strike,blink,s");
this.put("sub", tagInfo);
tagInfo = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,big,small,strike,blink,s");
this.put("sup", tagInfo);
tagInfo = new TagInfo("big", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,small,strike,blink,s");
this.put("big", tagInfo);
tagInfo = new TagInfo("small", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,strike,blink,s");
this.put("small", tagInfo);
tagInfo = new TagInfo("strike", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,blink,s");
this.put("strike", tagInfo);
tagInfo = new TagInfo("blink", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,s");
this.put("blink", tagInfo);
tagInfo = new TagInfo("marquee", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("marquee", tagInfo);
tagInfo = new TagInfo("s", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,blink");
this.put("s", tagInfo);
tagInfo = new TagInfo("hr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("hr", tagInfo);
tagInfo = new TagInfo("font", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
this.put("font", tagInfo);
tagInfo = new TagInfo("basefont", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.none);
this.put("basefont", tagInfo);
tagInfo = new TagInfo("center", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("center", tagInfo);
tagInfo = new TagInfo("comment", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
this.put("comment", tagInfo);
tagInfo = new TagInfo("server", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
this.put("server", tagInfo);
tagInfo = new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
this.put("iframe", tagInfo);
tagInfo = new TagInfo("embed", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("embed", tagInfo);
}
/**
 * Registers a tag descriptor in this provider's tag map under the given name.
 * Delegates directly to the map's {@code put}, so the map's own handling of an
 * already-present name applies.
 *
 * @param tagName the name under which the descriptor is stored
 * @param tagInfo the descriptor to associate with {@code tagName}
 */
protected void put(String tagName, TagInfo tagInfo) {
this.tagInfoMap.put(tagName, tagInfo);
}
/**
 * Looks up the descriptor stored for a tag name.
 *
 * @param tagName the tag name to look up; may be {@code null}
 * @return {@code null} when {@code tagName} is {@code null}, otherwise whatever
 *         the tag map holds for that name
 */
public TagInfo getTagInfo(String tagName) {
    // A tag node without a name shows up when an HTML fragment is being processed,
    // so the map is never consulted with a null key.
    return tagName == null ? null : this.tagInfoMap.get(tagName);
}
}
@@ -0,0 +1,62 @@
package org.htmlcleaner;
/**
 * Most HTML 4 elements permitted within the BODY are classified as either
 * block-level elements or inline elements. This enumeration contains
 * corresponding constants to distinguish them, and attaches two formatting
 * hints to each constant that serializers can query.
 *
 * @author Konstantin Burov (aectann@gmail.com)
 *
 */
public enum Display {
    /**
     * Block-level elements typically contain inline elements and other
     * block-level elements. When rendered visually, block-level elements
     * usually begin on a new line.
     */
    block(true, false),
    /**
     * Inline elements typically may only contain text and other inline
     * elements. When rendered visually, inline elements do not usually begin on
     * a new line.
     */
    inline(false, true),
    /**
     * The following elements may be used as either block-level elements or
     * inline elements. If used as inline elements (e.g., within another inline
     * element or a P), these elements should not contain any block-level
     * elements.
     */
    any(true, false),
    /**
     * Elements that are not actually inline or block, usually such elements are
     * not rendered at all.
     */
    none(true, false);

    // Enum constants are shared singletons, so their state is fixed at construction.
    private final boolean afterTagLineBreakNeeded;
    private final boolean leadingAndEndWhitespacesAllowed;

    /**
     * @param afterTagLineBreakNeeded whether serializers are advised to break the line after such tags
     * @param leadingAndEndWhitespacesAllowed whether tag contents may keep a single leading or end whitespace
     */
    Display(boolean afterTagLineBreakNeeded, boolean leadingAndEndWhitespacesAllowed) {
        this.afterTagLineBreakNeeded = afterTagLineBreakNeeded;
        this.leadingAndEndWhitespacesAllowed = leadingAndEndWhitespacesAllowed;
    }

    /**
     * @return true to advise serializers to put line break after tags with such a display type.
     */
    public boolean isAfterTagLineBreakNeeded() {
        return afterTagLineBreakNeeded;
    }

    /**
     * @return true if tag contents can have single leading or end whitespace
     */
    public boolean isLeadingAndEndWhitespacesAllowed() {
        return leadingAndEndWhitespacesAllowed;
    }
}
@@ -0,0 +1,389 @@
/* Copyright (c) 2006-2013, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.Locale;
/**
* <p>HTML doctype token.</p>
*/
public class DoctypeToken extends BaseHtmlNode implements HtmlNode{
//
// Part 1 is the document type, typically 'html' or 'HTML'; stored exactly as supplied
//
private String part1;
//
// Part 2 is the PUBLIC or SYSTEM token; the constructors store it upper-cased.
// It may be null when the declaration carries no such token.
//
private String part2;
//
// Part 3 is the PUBLIC identifier, typically '-//W3C//DTD HTML 4.01//EN' or similar.
// The constructors pass it through clean(), which replaces <, >, &, ' and " with spaces.
//
private String part3;
//
// Part 4 is the SYSTEM identifier, typically a URL for the DTD.
// It is passed through clean() as well; for 5-part declarations it is taken
// from the fifth constructor argument.
//
private String part4;
/**
 * The identified DocType, if any: one of the int constants declared below,
 * assigned by validate(). Starts out as null.
 */
private Integer type = null;
//
// Constants for identified doctypes
//
public static final int UNKNOWN = 0;
public static final int HTML4_0 = 10;
public static final int HTML4_01 = 20;
public static final int HTML4_01_STRICT = 21;
public static final int HTML4_01_TRANSITIONAL = 22;
public static final int HTML4_01_FRAMESET = 23;
public static final int XHTML1_0_STRICT = 31;
public static final int XHTML1_0_TRANSITIONAL = 32;
public static final int XHTML1_0_FRAMESET = 33;
public static final int XHTML1_1 = 40;
public static final int XHTML1_1_BASIC = 41;
public static final int HTML5 = 60;
public static final int HTML5_LEGACY_TOOL_COMPATIBLE = 61;
//
// Whether the DocType is valid; assigned by validate() and null until it records a verdict
//
private Boolean valid = null;
/**
 * Creates a doctype token from a 4-part declaration and immediately validates it.
 *
 * @param part1 the document type name, typically {@code html}; stored unchanged
 * @param part2 the {@code PUBLIC} or {@code SYSTEM} keyword, stored upper-cased; may be {@code null}
 * @param part3 the public identifier; {@code <}, {@code >}, {@code &} and quote characters are replaced with spaces
 * @param part4 the system identifier; cleaned the same way as {@code part3}
 */
public DoctypeToken(String part1, String part2, String part3, String part4) {
    this.part1 = part1;
    // Upper-case with Locale.ROOT so the keyword does not depend on the JVM default
    // locale (a Turkish locale would otherwise turn "public" into "PUBLİC").
    this.part2 = part2 == null ? null : part2.toUpperCase(Locale.ROOT);
    this.part3 = clean(part3);
    this.part4 = clean(part4);
    validate();
}

/**
 * Constructor for 5-part DocTypes, e.g.
 * {@code <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" SYSTEM "http://www.w3.org/TR/html4/strict.dtd">}.
 * For this we ignore part4 as we assume that must be "SYSTEM"; the system
 * identifier is taken from part5 instead.
 *
 * @param part1 the document type name
 * @param part2 the {@code PUBLIC} or {@code SYSTEM} keyword; may be {@code null}
 * @param part3 the public identifier
 * @param part4 ignored (assumed to be the literal {@code SYSTEM} keyword)
 * @param part5 the system identifier
 */
public DoctypeToken(String part1, String part2, String part3, String part4, String part5) {
    // Same initialisation as the 4-part form, with part5 taking the system-identifier slot.
    this(part1, part2, part3, part5);
}
/**
 * Replaces markup-significant characters in a doctype identifier with spaces.
 *
 * @param s the raw identifier; may be {@code null}
 * @return {@code s} with every {@code >}, {@code <}, {@code &}, single quote and
 *         double quote replaced by a space, or {@code null} if {@code s} is {@code null}
 */
private String clean(String s) {
    if (s == null) {
        return null;
    }
    String cleaned = s;
    // Each of these characters is swapped for a single space, one pass per character.
    for (char unsafe : new char[] {'>', '<', '&', '\'', '"'}) {
        cleaned = cleaned.replace(unsafe, ' ');
    }
    return cleaned;
}
/**
 * Reports whether validation marked this doctype as valid.
 *
 * @return {@code true} only if the validity flag was set to true; {@code false}
 *         if it was set to false or was never assigned
 */
public boolean isValid() {
    // 'valid' is a nullable Boolean that starts out as null; unboxing it directly
    // would throw a NullPointerException if no validation branch assigned it.
    return Boolean.TRUE.equals(valid);
}
/**
 * Checks the doctype according to W3C parsing rules and tries to identify
 * the type and validity. Sets both {@code type} and {@code valid}; a doctype
 * that matches none of the known forms ends up as UNKNOWN and invalid.
 *
 * See:
 * <ul>
 * <li>http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax</li>
 * <li>http://dev.w3.org/html5/html-author/#doctype-declaration</li>
 * </ul>
 */
private void validate() {
    final boolean isPublic = "public".equalsIgnoreCase(part2);
    final boolean isSystem = "system".equalsIgnoreCase(part2);

    if (!isPublic && !isSystem) {
        //
        // No PUBLIC or SYSTEM token: only the short HTML 5 form is recognised
        //
        if ("html".equalsIgnoreCase(part1) && (part2 == null)) {
            type = HTML5;
            valid = true;
        }
    } else if (isPublic) {
        final String publicId = getPublicId();
        final String systemId = getSystemId();

        if ("-//W3C//DTD HTML 4.0//EN".equals(publicId)) {
            //
            // HTML 4.0 is valid without an ID, or with strict DTD ID
            //
            type = HTML4_0;
            valid = "http://www.w3.org/TR/REC-html40/strict.dtd".equals(systemId) || "".equals(systemId);
        } else if ("-//W3C//DTD HTML 4.01//EN".equals(publicId)) {
            //
            // HTML 4.0.1 STRICT is valid with Strict dtd ID or empty
            //
            type = HTML4_01_STRICT;
            valid = "http://www.w3.org/TR/html4/strict.dtd".equals(systemId) || "".equals(systemId);
        } else if ("-//W3C//DTD HTML 4.01 Transitional//EN".equals(publicId)) {
            //
            // HTML 4.0.1 TRANSITIONAL valid only with Transitional DTD ID
            //
            type = HTML4_01_TRANSITIONAL;
            valid = "http://www.w3.org/TR/html4/loose.dtd".equals(systemId);
        } else if ("-//W3C//DTD HTML 4.01 Frameset//EN".equals(publicId)) {
            //
            // HTML 4.0.1 FRAMESET valid only with Frameset ID
            //
            type = HTML4_01_FRAMESET;
            valid = "http://www.w3.org/TR/html4/frameset.dtd".equals(systemId);
        } else if ("-//W3C//DTD XHTML 1.0 Strict//EN".equals(publicId)) {
            //
            // XHTML 1.0
            //
            type = XHTML1_0_STRICT;
            valid = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd".equals(systemId);
        } else if ("-//W3C//DTD XHTML 1.0 Transitional//EN".equals(publicId)) {
            //
            // XHTML 1.0 Transitional
            //
            type = XHTML1_0_TRANSITIONAL;
            valid = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(systemId);
        } else if ("-//W3C//DTD XHTML 1.0 Frameset//EN".equals(publicId)) {
            //
            // XHTML 1.0 Frameset
            //
            type = XHTML1_0_FRAMESET;
            valid = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd".equals(systemId);
        } else if ("-//W3C//DTD XHTML 1.1//EN".equals(publicId)) {
            //
            // XHTML 1.1
            //
            type = XHTML1_1;
            valid = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd".equals(systemId);
        } else if ("-//W3C//DTD XHTML Basic 1.1//EN".equals(publicId)) {
            //
            // XHTML 1.1 Basic. The W3C Recommendation publishes the DTD under
            // /TR/xhtml-basic/; the /TR/xhtml11/DTD/ location is still accepted
            // so that documents this check used to treat as valid stay valid.
            //
            type = XHTML1_1_BASIC;
            valid = "http://www.w3.org/TR/xhtml-basic/xhtml-basic11.dtd".equals(systemId)
                    || "http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd".equals(systemId);
        }
    } else {
        //
        // SYSTEM token: HTML 5 legacy tool compatible. With no public identifier
        // present, the single identifier is held in the third part, which is
        // what getPublicId() returns.
        //
        if ("about:legacy-compat".equals(getPublicId())) {
            type = HTML5_LEGACY_TOOL_COMPATIBLE;
            valid = true;
        }
    }

    // Nothing above matched.
    if (type == null) {
        type = UNKNOWN;
        valid = false;
    }
}
/**
 * Renders this doctype as markup.
 *
 * @return {@code <!DOCTYPE>} when the type is UNKNOWN and no name was given;
 *         otherwise the name, followed - when a PUBLIC/SYSTEM keyword is
 *         present - by the keyword, the quoted first identifier and, if it is
 *         neither null nor empty, the quoted second identifier
 */
public String getContent() {
    if (type == UNKNOWN && part1 == null){
        return "<!DOCTYPE>";
    }
    StringBuilder result = new StringBuilder("<!DOCTYPE ");
    //
    // If the type is XHTML or HTML5, the output is "html", otherwise it should be "HTML"
    //
    if (type != UNKNOWN){
        // NOTE(review): 30 is presumably the lowest XHTML type code; those
        // constants are declared outside this method - confirm.
        result.append(type >= 30 ? "html" : "HTML");
    } else {
        //
        // if its an unknown doctype, just pass through as-is.
        //
        result.append(part1);
    }
    if (part2 != null){
        result.append(' ').append(part2).append(" \"").append(part3).append('"');
        // A missing second identifier may be null as well as empty; skip both
        // rather than writing the literal text "null" into the markup.
        if (part4 != null && !"".equals(part4)) {
            result.append(" \"").append(part4).append('"');
        }
    }
    result.append('>');
    return result.toString();
}
/**
 * Returns the same markup as {@link #getContent()}.
 */
@Override
public String toString() {
return getContent();
}
/**
 * This will retrieve an integer representing the identified DocType.
 *
 * @return the type code assigned by validate(); UNKNOWN when the doctype
 *         matched none of the forms validate() recognises
 */
public int getType(){
return type;
}
/**
 * A doctype token has no name of its own.
 *
 * @return always the empty string
 */
public String getName() {
return "";
}
/**
 * Writes the doctype markup from {@link #getContent()} followed by a newline.
 *
 * @param serializer not used by this implementation
 * @param writer the destination for the markup
 * @throws IOException if the writer fails
 */
public void serialize(Serializer serializer, Writer writer) throws IOException {
writer.write(getContent() + "\n");
}
/**
 * This will retrieve the public ID of an externally referenced DTD.
 * The value is the third constructor part after clean() has been applied.
 *
 * @return the stored third part; NOTE(review): this is null when the token
 *         was constructed with a null third part - callers presumably pass
 *         an empty String when no DTD is referenced, confirm
 */
public String getPublicId(){
return part3;
}
/**
 * This will retrieve the system ID of an externally referenced DTD.
 * The value is the fourth constructor part (or the fifth, for the 5-part
 * constructor) after clean() has been applied.
 *
 * @return the stored fourth part; NOTE(review): this is null when the token
 *         was constructed with a null value - callers presumably pass an
 *         empty String when no DTD is referenced, confirm
 */
public String getSystemId(){
return part4;
}
/**
 * @return the first part of the DOCTYPE declaration (the document type name), exactly as passed to the constructor
 */
public String getPart1() {
return part1;
}
/**
 * @return the second part of the DOCTYPE declaration (the PUBLIC/SYSTEM keyword) as upper-cased by the constructor, or null if none was given
 */
public String getPart2() {
return part2;
}
/**
 * Returns the third part of the DOCTYPE declaration, the same field that
 * {@link #getPublicId()} returns.
 *
 * @return the third part of the DOCTYPE declaration
 * @deprecated use {@link #getPublicId()} instead
 */
@Deprecated
public String getPart3() {
return part3;
}
/**
 * Returns the fourth part of the DOCTYPE declaration, the same field that
 * {@link #getSystemId()} returns.
 *
 * @return the fourth part of the DOCTYPE declaration
 * @deprecated use {@link #getSystemId()} instead
 */
@Deprecated
public String getPart4() {
return part4;
}
}
@@ -0,0 +1,275 @@
package org.htmlcleaner;
import java.util.Iterator;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
/**
 * Builds a W3C DOM {@link Document} from HtmlCleaner nodes that are handed to
 * it one at a time through the {@link XmlVisitor} callbacks.
 *
 * <p>The document is created lazily by {@link #head(HtmlNode, int)} from the
 * first tag node that has a usable XML name; afterwards each tag node becomes
 * a child element of the current destination element and then becomes the
 * destination itself, until {@link #tail(HtmlNode, int)} steps back up.</p>
 *
 * <p>NOTE(review): presumably the visitor's driver calls head() when a node is
 * entered and tail() when it is left - confirm against XmlVisitor.</p>
 */
public class DomBuilder implements XmlVisitor{

    /** The document being built; null until head() has seen the first usable tag node. */
    private Document document;

    /** The element that currently receives new child nodes. */
    private Element destinationElement;

    /** Cleaner settings consulted for escaping, CDATA handling and attribute-name rules. */
    private CleanerProperties props;

    /** Whether text and attribute values are XML-escaped. */
    protected boolean escapeXml = true;

    /** Whether entities inside content of CDATA-enabled tags are deserialized. */
    protected boolean deserializeCdataEntities = false;

    /** When false, strict error checking is switched off on the created Document. */
    protected boolean strictErrorChecking = true;

    private static final String CSS_COMMENT_START = "/*";

    /**
     * @param props the HTML Cleaner properties controlling the conversion
     * @param escapeXml if true then escape XML entities
     * @param deserializeCdataEntities if true then deserialize entities in content of CDATA-enabled tags
     * @param strictErrorChecking if false then Document strict error checking is turned off
     */
    public DomBuilder(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking){
        this.props = props;
        this.escapeXml = escapeXml;
        this.deserializeCdataEntities = deserializeCdataEntities;
        this.strictErrorChecking = strictErrorChecking;
    }

    /**
     * @return the document built so far, or null if no tag node has been visited yet
     */
    public Document getDocument(){
        return this.document;
    }

    /**
     * @return true if text content has to go through Utils.escapeXml, i.e. when
     *         escaping is on or either entity-related cleaner property is set
     */
    private boolean shouldEscapeOrTranslateEntities() {
        return escapeXml || props.isRecognizeUnicodeChars() || props.isTranslateSpecialEntities();
    }

    /**
     * Adds the DOM counterpart of the given node below the current destination element.
     *
     * @param node the comment, content or tag node to convert
     * @param depth not used by this implementation
     * @throws IllegalStateException if the DOM document cannot be created for the first tag node
     */
    public void head(HtmlNode node, int depth) {
        //
        // For script and style nodes, check if we're set to use CDATA
        //
        // NOTE(review): at this point destinationElement is still the element
        // that was current before this tag, so the comment-start text and the
        // empty CDATA section are appended there, and nothing visible in this
        // class closes the comment again. Confirm whether that is intended.
        //
        CDATASection cdata = null;
        if (node instanceof TagNode && props.isUseCdataFor(((TagNode)node).getName())){
            cdata = document.createCDATASection("");
            destinationElement.appendChild(document.createTextNode(CSS_COMMENT_START));
            destinationElement.appendChild(cdata);
        }
        if (node instanceof CommentNode) {
            CommentNode commentNode = (CommentNode) node;
            Comment comment = document.createComment( commentNode.getContent() );
            destinationElement.appendChild(comment);
        } else if (node instanceof ContentNode) {
            ContentNode contentNode = (ContentNode) node;
            String content = contentNode.getContent();
            boolean specialCase = props.isUseCdataFor(node.getParent().getName());
            if (shouldEscapeOrTranslateEntities() && !specialCase) {
                content = Utils.escapeXml(content, props, true);
            }
            if (specialCase && node instanceof CData){
                //
                // For CDATA sections we don't want to return the start and
                // end tokens. See issue #106.
                //
                content = ((CData)node).getContentWithoutStartAndEndTokens();
            }
            if (specialCase && deserializeCdataEntities){
                content = this.deserializeCdataEntities(content);
            }
            if (cdata != null){
                cdata.appendData(content);
            } else {
                destinationElement.appendChild(document.createTextNode(content) );
            }
        } else if (node instanceof TagNode) {
            TagNode subTagNode = (TagNode) node;
            //
            // XML element names are more strict in their definition
            // than HTML tag identifiers.
            // See https://www.w3.org/TR/xml/#NT-Name
            // vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
            //
            String name = Utils.sanitizeXmlIdentifier(subTagNode.getName(), props.getInvalidXmlAttributeNamePrefix());
            //
            // If the element name is completely invalid, treat it as text
            //
            if (name == null){
                ContentNode contentNode = new ContentNode(subTagNode.getName() + subTagNode.getText().toString());
                String content = contentNode.getContent();
                content = Utils.escapeXml(content, props, true);
                destinationElement.appendChild(document.createTextNode(content) );
            } else {
                if (document == null){
                    try {
                        document = this.createDocument(subTagNode);
                    } catch (ParserConfigurationException e) {
                        // Without a Document nothing further can be built. Fail
                        // here with the cause attached instead of printing the
                        // stack trace and then hitting a NullPointerException
                        // on the createElement call below.
                        throw new IllegalStateException(
                                "Unable to create the DOM document for root element '" + subTagNode.getName() + "'", e);
                    }
                }
                Element element = document.createElement( name );
                //
                // Create attributes
                //
                Map<String, String> attributes = subTagNode.getAttributes();
                Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
                while (entryIterator.hasNext()) {
                    Map.Entry<String, String> entry = entryIterator.next();
                    String attrName = entry.getKey();
                    String attrValue = entry.getValue();
                    if (escapeXml) {
                        attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
                        attrValue = Utils.escapeXml(attrValue, props, true);
                    }
                    //
                    // Fix any invalid attribute names by adding a prefix
                    //
                    if (!props.isAllowInvalidAttributeNames()){
                        attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
                    }
                    if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())){
                        element.setAttribute(attrName, attrValue);
                        //
                        // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
                        //
                        if (attrName.equalsIgnoreCase("id")) {
                            element.setIdAttribute(attrName, true);
                        }
                    }
                }
                if (destinationElement == null){
                    // First tag node: the root element (with its attributes) was
                    // already produced by createDocument(), so the element built
                    // above is not attached and the document element is used instead.
                    destinationElement = document.getDocumentElement();
                } else {
                    destinationElement.appendChild(element);
                    destinationElement = element;
                }
                //
                // Hack for now, we need a better way to do this in future
                //
                // Content children get their parent set here so that the
                // getParent() lookup in the ContentNode branch above works.
                //
                for (Object token: subTagNode.getAllChildren()){
                    if (token instanceof ContentNode){
                        ((ContentNode)token).setParent(subTagNode);
                    }
                }
            }
        }
    }

    /**
     * Hook for subclasses to change how entities in content of CDATA-enabled tags are deserialized.
     *
     * @param input the raw content
     * @return the result of Utils.deserializeEntities for the input
     */
    protected String deserializeCdataEntities(String input){
        return Utils.deserializeEntities(input, props.isRecognizeUnicodeChars());
    }

    /**
     * Steps the destination back to its parent element once a tag node is finished.
     * Nothing happens for non-tag nodes, or when the parent is not an Element
     * (as is the case for the document element).
     *
     * @param node the node being left
     * @param depth not used by this implementation
     */
    public void tail(HtmlNode node, int depth) {
        if (node instanceof TagNode && destinationElement.getParentNode() instanceof Element) {
            destinationElement = (Element) destinationElement.getParentNode();
        }
    }

    //
    // Allow overriding of serialization for implementations. See bug #167.
    //
    /**
     * Creates the DOM document for the given root node, including a document
     * type when the root node carries one, and copies the root node's
     * attributes onto the document element.
     *
     * @param rootNode the HTML Cleaner node that becomes the document element
     * @return the new document with its document element in place
     * @throws ParserConfigurationException if no DocumentBuilder can be created
     */
    protected Document createDocument(TagNode rootNode) throws ParserConfigurationException{
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        DOMImplementation impl = builder.getDOMImplementation();
        Document document;
        //
        // Where a DOCTYPE is supplied in the input, ensure that this is in the output DOM. See issue #27
        //
        // Note that we may want to fix incorrect DOCTYPEs in future; there are some fairly
        // common patterns for errors with the older HTML4 doctypes.
        //
        if (rootNode.getDocType() != null){
            String qualifiedName = rootNode.getDocType().getPart1();
            String publicId = rootNode.getDocType().getPublicId();
            String systemId = rootNode.getDocType().getSystemId();
            //
            // If there is no qualified name, set it to html. See bug #153.
            //
            if (qualifiedName == null) qualifiedName = "html";
            DocumentType documentType = impl.createDocumentType(qualifiedName, publicId, systemId);
            //
            // While the qualified name is "HTML" for some DocTypes, we want the actual document root name to be "html". See bug #116
            //
            if (qualifiedName.equals("HTML")) qualifiedName = "html";
            document = impl.createDocument(rootNode.getNamespaceURIOnPath(""), qualifiedName, documentType);
        } else {
            document = builder.newDocument();
            Element rootElement = document.createElement(rootNode.getName());
            document.appendChild(rootElement);
        }
        //
        // Turn off error checking if we're allowing invalid attribute names, or if we've chosen to turn it off
        //
        if (props.isAllowInvalidAttributeNames() || !strictErrorChecking){
            document.setStrictErrorChecking(false);
        }
        //
        // Copy across root node attributes - see issue 127. Thanks to rasifiel for the patch
        //
        Map<String, String> attributes = rootNode.getAttributes();
        Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
        while (entryIterator.hasNext()) {
            Map.Entry<String, String> entry = entryIterator.next();
            String attrName = entry.getKey();
            String attrValue = entry.getValue();
            //
            // Fix any invalid attribute names
            //
            if (!props.isAllowInvalidAttributeNames()){
                attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
            }
            if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())){
                if (escapeXml) {
                    attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
                    attrValue = Utils.escapeXml(attrValue, props, true);
                }
                document.getDocumentElement().setAttribute(attrName, attrValue);
                //
                // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
                //
                if (attrName.equalsIgnoreCase("id")) {
                    document.getDocumentElement().setIdAttribute(attrName, true);
                }
            }
        }
        return document;
    }
}
@@ -0,0 +1,410 @@
/* Copyright (c) 2006-2019, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
 * <p>DOM serializer - creates xml DOM.</p>
 *
 * <p>Converts an HtmlCleaner {@code TagNode} tree into a W3C {@link Document}:
 * {@link #createDOM(TagNode)} builds the document and its root element and then
 * recursively converts all children.</p>
 */
public class DomSerializer {

    private static final String CSS_COMMENT_START = "/*";
    private static final String CSS_COMMENT_END = "*/";
    private static final String NEW_LINE = "\n";
    private static final String XML_10 = "1.0";
    private static final String XML_11 = "1.1";

    /**
     * The HTML Cleaner properties set by the user to control the HTML cleaning.
     */
    protected CleanerProperties props;

    /**
     * Whether XML entities should be escaped or not.
     */
    protected boolean escapeXml = true;

    /**
     * Whether entities inside content of CDATA-enabled tags are deserialized.
     */
    protected boolean deserializeCdataEntities = false;

    /**
     * When false, strict error checking is switched off on the created Document.
     */
    protected boolean strictErrorChecking = true;

    /**
     * The XML version set on created documents; "1.0" unless changed.
     */
    protected String xmlVersion = XML_10;

    /**
     * @return the XML version that will be set on created documents
     */
    public String getXmlVersion() {
        return xmlVersion;
    }

    /**
     * Sets the XML version for created documents.
     *
     * @param xmlVersion either "1.0" or "1.1"
     * @throws Exception if the value is anything else, including null
     */
    public void setXmlVersion(String xmlVersion) throws Exception {
        // Compare by value: a reference comparison only accepts strings that
        // happen to be the same interned instance as the constants, so a
        // version read from configuration or built at runtime was rejected.
        if (XML_10.equals(xmlVersion) || XML_11.equals(xmlVersion)) {
            this.xmlVersion = xmlVersion;
        } else {
            throw new Exception("Invalid XML version - must be 1.0 or 1.1");
        }
    }

    /**
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     * @param escapeXml if true then escape XML entities
     * @param deserializeCdataEntities if true then deserialize entities in CData sections
     * @param strictErrorChecking if false then Document strict error checking is turned off
     */
    public DomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking){
        this.props = props;
        this.escapeXml = escapeXml;
        this.deserializeCdataEntities = deserializeCdataEntities;
        this.strictErrorChecking = strictErrorChecking;
    }

    /**
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     * @param escapeXml if true then escape XML entities
     * @param deserializeCdataEntities if true then deserialize entities in CData sections
     */
    public DomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities) {
        this.props = props;
        this.escapeXml = escapeXml;
        this.deserializeCdataEntities = deserializeCdataEntities;
    }

    /**
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     * @param escapeXml if true then escape XML entities
     */
    public DomSerializer(CleanerProperties props, boolean escapeXml) {
        this.props = props;
        this.escapeXml = escapeXml;
    }

    /**
     * Creates a serializer that escapes XML entities.
     *
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     */
    public DomSerializer(CleanerProperties props) {
        this(props, true);
    }

    //
    // Allow overriding of serialization for implementations. See bug #167.
    //
    /**
     * Creates the DOM document for the given root node, including a document
     * type when the root node carries one, sets the configured XML version and
     * copies the root node's attributes onto the document element.
     *
     * @param rootNode the HTML Cleaner node that becomes the document element
     * @return the new document with its document element in place
     * @throws ParserConfigurationException if no DocumentBuilder can be created
     */
    protected Document createDocument(TagNode rootNode) throws ParserConfigurationException{
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        DOMImplementation impl = builder.getDOMImplementation();
        Document document;
        //
        // Where a DOCTYPE is supplied in the input, ensure that this is in the output DOM. See issue #27
        //
        // Note that we may want to fix incorrect DOCTYPEs in future; there are some fairly
        // common patterns for errors with the older HTML4 doctypes.
        //
        if (rootNode.getDocType() != null){
            String qualifiedName = rootNode.getDocType().getPart1();
            String publicId = rootNode.getDocType().getPublicId();
            String systemId = rootNode.getDocType().getSystemId();
            //
            // If there is no qualified name, set it to html. See bug #153.
            //
            if (qualifiedName == null) qualifiedName = "html";
            DocumentType documentType = impl.createDocumentType(qualifiedName, publicId, systemId);
            //
            // While the qualified name is "HTML" for some DocTypes, we want the actual document root name to be "html". See bug #116
            //
            if (qualifiedName.equals("HTML")) qualifiedName = "html";
            document = impl.createDocument(rootNode.getNamespaceURIOnPath(""), qualifiedName, documentType);
            document.setXmlVersion(xmlVersion);
        } else {
            document = builder.newDocument();
            document.setXmlVersion(xmlVersion);
            Element rootElement = document.createElement(rootNode.getName());
            document.appendChild(rootElement);
        }
        //
        // Turn off error checking if we're allowing invalid attribute names, or if we've chosen to turn it off
        //
        if (props.isAllowInvalidAttributeNames() || !strictErrorChecking){
            document.setStrictErrorChecking(false);
        }
        //
        // Copy across root node attributes - see issue 127. Thanks to rasifiel for the patch
        //
        Map<String, String> attributes = rootNode.getAttributes();
        Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
        while (entryIterator.hasNext()) {
            Map.Entry<String, String> entry = entryIterator.next();
            String attrName = entry.getKey();
            String attrValue = entry.getValue();
            //
            // Fix any invalid attribute names
            //
            if (!props.isAllowInvalidAttributeNames()){
                attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
            }
            if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())){
                if (escapeXml) {
                    attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
                    attrValue = Utils.escapeXml(attrValue, props, true);
                }
                document.getDocumentElement().setAttribute(attrName, attrValue);
                //
                // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
                //
                if (attrName.equalsIgnoreCase("id")) {
                    document.getDocumentElement().setIdAttribute(attrName, true);
                }
            }
        }
        return document;
    }

    /**
     * @param rootNode the HTML Cleaner root node to serialize
     * @return the W3C Document object
     * @throws ParserConfigurationException if there's an error during serialization
     */
    public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
        Document document = createDocument(rootNode);
        createSubnodes(document, (Element)document.getDocumentElement(), rootNode.getAllChildren());
        return document;
    }

    /**
     * @param element the element to check
     * @return true if the passed element is a script or style element
     */
    protected boolean isScriptOrStyle(Element element) {
        String tagName = element.getNodeName();
        return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
    }

    /**
     * Tells whether the element is one of the tags configured for CDATA output
     * but has no children or only blank text content, e.g.
     * {@code <script src=..></script>}.
     *
     * @param element the element to check
     * @return true if the element's tag is CDATA-enabled and it has no non-blank text content
     */
    protected boolean dontEscape(Element element) {
        // make sure <script src=..></script> doesn't get turned into <script src=..><[CDATA[]]></script>
        return props.isUseCdataFor(element.getNodeName()) && (!element.hasChildNodes() || element.getTextContent() == null || element.getTextContent().trim().length() == 0);
    }

    /**
     * @param cdata the CDATA node to output
     * @return the CDATA content without its start and end tokens
     */
    protected String outputCData(CData cdata){
        return cdata.getContentWithoutStartAndEndTokens();
    }

    /**
     * Hook for subclasses to change how entities in content of CDATA-enabled tags are deserialized.
     *
     * @param input the raw content
     * @return the result of Utils.deserializeEntities for the input
     */
    protected String deserializeCdataEntities(String input){
        return Utils.deserializeEntities(input, props.isRecognizeUnicodeChars());
    }

    /**
     * Serialize a given HTML Cleaner node.
     *
     * @param document the W3C Document to use for creating new DOM elements
     * @param element the W3C element to which we'll add the subnodes to
     * @param tagChildren the HTML Cleaner nodes to serialize for that node; nothing is done when this is null
     */
    protected void createSubnodes(Document document, Element element, List<? extends BaseToken> tagChildren) {
        if (tagChildren != null) {
            CDATASection cdata = null;
            //
            // For script and style nodes, check if we're set to use CDATA
            //
            if (props.isUseCdataFor(element.getTagName())){
                cdata = document.createCDATASection("");
                element.appendChild(document.createTextNode(CSS_COMMENT_START));
                element.appendChild(cdata);
            }
            Iterator<? extends BaseToken> it = tagChildren.iterator();
            while (it.hasNext()) {
                Object item = it.next();
                if (item instanceof CommentNode) {
                    CommentNode commentNode = (CommentNode) item;
                    Comment comment = document.createComment( commentNode.getContent() );
                    element.appendChild(comment);
                } else if (item instanceof ContentNode) {
                    ContentNode contentNode = (ContentNode) item;
                    String content = contentNode.getContent();
                    boolean specialCase = props.isUseCdataFor(element.getTagName());
                    if (props.isRecognizeUnicodeChars() && props.isTranslateSpecialEntities()) {
                        content = Utils.deserializeEntities(content, props.isRecognizeUnicodeChars());
                    }
                    if ((escapeXml || props.isTranslateSpecialEntities()) && !specialCase) {
                        content = Utils.escapeXml(content, props, true);
                    }
                    if (specialCase && item instanceof CData){
                        //
                        // For CDATA sections we don't want to return the start and
                        // end tokens. See issue #106.
                        //
                        content = ((CData)item).getContentWithoutStartAndEndTokens();
                    }
                    if (specialCase && deserializeCdataEntities){
                        content = this.deserializeCdataEntities(content);
                    }
                    if (cdata != null){
                        cdata.appendData(content);
                    } else {
                        element.appendChild(document.createTextNode(content) );
                    }
                } else if (item instanceof TagNode) {
                    TagNode subTagNode = (TagNode) item;
                    //
                    // XML element names are more strict in their definition
                    // than HTML tag identifiers.
                    // See https://www.w3.org/TR/xml/#NT-Name
                    // vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
                    //
                    String name = Utils.sanitizeXmlIdentifier(subTagNode.getName(), props.getInvalidXmlAttributeNamePrefix());
                    //
                    // If the element name is completely invalid, treat it as text
                    //
                    if (name == null){
                        ContentNode contentNode = new ContentNode(subTagNode.getName() + subTagNode.getText().toString());
                        String content = contentNode.getContent();
                        content = Utils.escapeXml(content, props, true);
                        element.appendChild(document.createTextNode(content) );
                    } else {
                        Element subelement = document.createElement( name );
                        Map<String, String> attributes = subTagNode.getAttributes();
                        Iterator<Map.Entry<String, String>> entryIterator = attributes.entrySet().iterator();
                        while (entryIterator.hasNext()) {
                            Map.Entry<String, String> entry = entryIterator.next();
                            String attrName = entry.getKey();
                            String attrValue = entry.getValue();
                            if (escapeXml) {
                                attrValue = Utils.deserializeEntities(attrValue, true);
                                attrValue = Utils.escapeXml(attrValue, props, true);
                            }
                            //
                            // Fix any invalid attribute names by adding a prefix
                            //
                            if (!props.isAllowInvalidAttributeNames()){
                                attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
                            }
                            if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())){
                                subelement.setAttribute(attrName, attrValue);
                                //
                                // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
                                //
                                if (attrName.equalsIgnoreCase("id")) {
                                    subelement.setIdAttribute(attrName, true);
                                }
                            }
                        }
                        // recursively create subnodes
                        createSubnodes(document, subelement, subTagNode.getAllChildren());
                        element.appendChild(subelement);
                    }
                } else if (item instanceof List) {
                    List<? extends BaseToken> sublist = (List<? extends BaseToken>) item;
                    createSubnodes(document, element, sublist);
                }
            }
            if (cdata != null){
                //
                // Finish the wrapper: the CDATA data starts with the comment
                // end (plus a newline if the content has none) and ends with a
                // newline and a new comment start, and a final text node closes
                // that comment after the section.
                //
                if (!cdata.getData().startsWith(NEW_LINE)){
                    cdata.setData(CSS_COMMENT_END + NEW_LINE + cdata.getData());
                } else {
                    cdata.setData(CSS_COMMENT_END + cdata.getData());
                }
                if (!cdata.getData().endsWith(NEW_LINE)){
                    cdata.appendData(NEW_LINE);
                }
                cdata.appendData(CSS_COMMENT_START);
                element.appendChild(document.createTextNode(CSS_COMMENT_END));
            }
        }
    }
}
@@ -0,0 +1,69 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.Writer;
/**
 * <p>HTML tag end token.</p>
 *
 * <p>A closing tag carries a name only: attributes offered to it are dropped
 * and it writes nothing when serialized.</p>
 */
public class EndTagToken extends TagToken {

    /** Creates an end tag token without a name. */
    public EndTagToken() {
    }

    /**
     * @param name the tag name, passed on to the superclass unchanged (null stays null)
     */
    public EndTagToken(String name) {
        super(name);
    }

    /**
     * Closing tags have no attributes, so whatever is offered here is discarded.
     */
    @Override
    void addAttribute(String attName, String attValue) {
        // intentionally empty
    }

    /**
     * End tag tokens produce no output of their own.
     */
    public void serialize(Serializer serializer, Writer writer) {
        // intentionally empty
    }

    @Override
    public String toString() {
        final String inherited = super.toString();
        return "endtoken" + inherited;
    }
}
@@ -0,0 +1,535 @@
/* Copyright (c) 2006-2015, Philokypros Ioulianou
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Philokypros Ioulianou by sending e-mail to
philokypro_s@hotmail.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
public class Html4TagProvider implements ITagInfoProvider {
// Tag name spliced into CLOSE_BEFORE_COPY_INSIDE_TAGS below.
private static final String STRONG = "strong";
// Tag definitions keyed by name. NOTE(review): presumably filled through put(), which is defined outside this excerpt - confirm.
private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
// singleton instance, used if no other TagInfoProvider is specified
public final static Html4TagProvider INSTANCE= new Html4TagProvider();
// Comma-separated tag list handed to TagInfo.defineCloseBeforeCopyInsideTags() by the block-level definitions.
private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo,"+STRONG+",em,q,b,i,u,tt,sub,sup,big,small,strike,s,font";
// Comma-separated tag list handed to TagInfo.defineCloseBeforeTags() by the block-level definitions.
private static final String CLOSE_BEFORE_TAGS = "p,details,summary,menuitem,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
/**
* Phrasing tags are those that can make up paragraphs along with text to make Phrasing Content
*/
private static final String PHRASING_TAGS = "a,abbr,area,b,bdi,bdo,br,button,canvas,cite,code,command,data,datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,keygen,label,link,map,mark,math,meta,meter,noscript,object,output,progress,q,s,samp,script,select,small,span,strong,sub,sup,svg,template,text,textarea,time,u,var,wbr";
public Html4TagProvider() {
TagInfo tagInfo=null;
basicElements(tagInfo);
formattingElements(tagInfo);
formElements(tagInfo);
imgElements(tagInfo);
listElements(tagInfo);
linkElements(tagInfo);
tableElements(tagInfo);
styleElements(tagInfo);
olderElements(tagInfo);
scriptElements(tagInfo);
}
/**
 * Registers the basic document elements: title, the headings h1 to h6, p, br, hr and div.
 *
 * @param tagInfo never read; every definition is built from scratch
 */
public void basicElements(TagInfo tagInfo){
    TagInfo title = new TagInfo("title", ContentType.text, BelongsTo.HEAD, false, true, false, CloseTag.required, Display.none);
    this.put("title", title);

    // The six heading levels share one definition apart from the name.
    for (String heading : new String[] {"h1", "h2", "h3", "h4", "h5", "h6"}) {
        TagInfo headingInfo = new TagInfo(heading, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
        headingInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
        headingInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
        this.put(heading, headingInfo);
    }

    // p and hr use a shorter close-before list than CLOSE_BEFORE_TAGS
    // (it has no details, summary or menuitem).
    final String paragraphCloseBefore = "p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";

    // jericho parser requires <p></p>
    TagInfo paragraph = new TagInfo("p", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    paragraph.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    paragraph.defineCloseBeforeTags(paragraphCloseBefore);
    this.put("p", paragraph);

    TagInfo lineBreak = new TagInfo("br", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
    this.put("br", lineBreak);

    TagInfo rule = new TagInfo("hr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
    rule.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    rule.defineCloseBeforeTags(paragraphCloseBefore);
    this.put("hr", rule);

    TagInfo division = new TagInfo("div", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    division.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    division.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    this.put("div", division);
}
/**
 * Registers the inline text-formatting tags together with the block-level
 * text containers declared alongside them (address, blockquote, marquee,
 * center, pre).
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void formattingElements(TagInfo tagInfo){
    // Close-before list shared by every block-level tag in this group.
    final String blockClosers = "p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";

    TagInfo abbr = new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("abbr", abbr);

    TagInfo acronym = new TagInfo("acronym", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("acronym", acronym);

    TagInfo address = new TagInfo("address", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    address.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    address.defineCloseBeforeTags(blockClosers);
    put("address", address);

    TagInfo bold = new TagInfo("b", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    bold.defineCloseInsideCopyAfterTags("u,i,tt,sub,sup,big,small,strike,blink,s");
    put("b", bold);

    TagInfo bdo = new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("bdo", bdo);

    TagInfo blockquote = new TagInfo("blockquote", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    blockquote.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    blockquote.defineCloseBeforeTags(blockClosers);
    put("blockquote", blockquote);

    TagInfo cite = new TagInfo("cite", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("cite", cite);

    TagInfo quote = new TagInfo("q", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("q", quote);

    TagInfo code = new TagInfo("code", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("code", code);

    TagInfo ins = new TagInfo("ins", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    put("ins", ins);

    // Each physical-style tag below lists all the other tags of the same family.
    TagInfo italic = new TagInfo("i", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    italic.defineCloseInsideCopyAfterTags("b,u,tt,sub,sup,big,small,strike,blink,s");
    put("i", italic);

    TagInfo underline = new TagInfo("u", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
    underline.defineCloseInsideCopyAfterTags("b,i,tt,sub,sup,big,small,strike,blink,s");
    put("u", underline);

    TagInfo teletype = new TagInfo("tt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    teletype.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,big,small,strike,blink,s");
    put("tt", teletype);

    TagInfo sub = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    sub.defineCloseInsideCopyAfterTags("b,u,i,tt,sup,big,small,strike,blink,s");
    put("sub", sub);

    TagInfo sup = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    sup.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,big,small,strike,blink,s");
    put("sup", sup);

    TagInfo big = new TagInfo("big", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    big.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,small,strike,blink,s");
    put("big", big);

    TagInfo small = new TagInfo("small", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    small.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,strike,blink,s");
    put("small", small);

    TagInfo strike = new TagInfo("strike", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
    strike.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,blink,s");
    put("strike", strike);

    TagInfo blink = new TagInfo("blink", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    blink.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,s");
    put("blink", blink);

    TagInfo marquee = new TagInfo("marquee", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    marquee.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    marquee.defineCloseBeforeTags(blockClosers);
    put("marquee", marquee);

    TagInfo strikeShort = new TagInfo("s", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
    strikeShort.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,blink");
    put("s", strikeShort);

    TagInfo font = new TagInfo("font", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
    put("font", font);

    TagInfo basefont = new TagInfo("basefont", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.none);
    put("basefont", basefont);

    TagInfo center = new TagInfo("center", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
    center.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    center.defineCloseBeforeTags(blockClosers);
    put("center", center);

    TagInfo del = new TagInfo("del", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    put("del", del);

    TagInfo dfn = new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("dfn", dfn);

    TagInfo kbd = new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("kbd", kbd);

    TagInfo pre = new TagInfo("pre", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    pre.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    pre.defineCloseBeforeTags(blockClosers);
    put("pre", pre);

    TagInfo samp = new TagInfo("samp", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("samp", samp);

    TagInfo strong = new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put(STRONG, strong);

    TagInfo em = new TagInfo("em", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("em", em);

    TagInfo varTag = new TagInfo("var", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("var", varTag);

    TagInfo wbr = new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
    put("wbr", wbr);
}
/**
 * Registers the form container and its controls: form, input, textarea,
 * select, option, optgroup, button, label, legend and fieldset.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void formElements(TagInfo tagInfo){
    // Close-before list shared by input, textarea and button.
    final String selectionClosers = "select,optgroup,option";

    TagInfo form = new TagInfo("form", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.block);
    form.defineForbiddenTags("form");
    form.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    form.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
    put("form", form);

    TagInfo input = new TagInfo("input", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
    input.defineCloseBeforeTags(selectionClosers);
    put("input", input);

    TagInfo textarea = new TagInfo("textarea", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    textarea.defineCloseBeforeTags(selectionClosers);
    put("textarea", textarea);

    TagInfo select = new TagInfo("select", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
    select.defineAllowedChildrenTags("option,optgroup");
    select.defineCloseBeforeTags("option,optgroup,select");
    put("select", select);

    TagInfo option = new TagInfo("option", ContentType.text, BelongsTo.BODY, false, false, true, CloseTag.optional, Display.inline);
    option.defineFatalTags("select");
    option.defineCloseBeforeTags("option");
    put("option", option);

    TagInfo optgroup = new TagInfo("optgroup", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
    optgroup.defineFatalTags("select");
    optgroup.defineAllowedChildrenTags("option");
    optgroup.defineCloseBeforeTags("optgroup");
    put("optgroup", optgroup);

    TagInfo button = new TagInfo("button", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    button.defineCloseBeforeTags(selectionClosers);
    put("button", button);

    TagInfo label = new TagInfo("label", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("label", label);

    TagInfo legend = new TagInfo("legend", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    // A required-enclosing rule for "fieldset" is deliberately not defined here:
    // including it causes an out-of-memory error. See issue 129.
    legend.defineAllowedChildrenTags(PHRASING_TAGS);
    put("legend", legend);

    TagInfo fieldset = new TagInfo("fieldset", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    fieldset.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    fieldset.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
    put("fieldset", fieldset);
}
/**
 * Registers the list tags: ul, ol, li, dl, dt, dd, menu and dir.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void listElements(TagInfo tagInfo){
    // Close-before list shared by the list containers (ul, ol, dl, menu, dir).
    final String containerClosers = "p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
    // dt and dd each close an open dt or dd.
    final String definitionClosers = "dt,dd";

    TagInfo unordered = new TagInfo("ul", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    unordered.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    unordered.defineCloseBeforeTags(containerClosers);
    put("ul", unordered);

    TagInfo ordered = new TagInfo("ol", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    ordered.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    ordered.defineCloseBeforeTags(containerClosers);
    put("ol", ordered);

    TagInfo item = new TagInfo("li", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
    item.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    item.defineCloseBeforeTags("li,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
    put("li", item);

    TagInfo definitionList = new TagInfo("dl", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    definitionList.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    definitionList.defineCloseBeforeTags(containerClosers);
    put("dl", definitionList);

    TagInfo term = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
    term.defineCloseBeforeTags(definitionClosers);
    put("dt", term);

    TagInfo description = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
    description.defineCloseBeforeTags(definitionClosers);
    put("dd", description);

    TagInfo menu = new TagInfo("menu", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
    menu.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    menu.defineCloseBeforeTags(containerClosers);
    put("menu", menu);

    TagInfo dir = new TagInfo("dir", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
    dir.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    dir.defineCloseBeforeTags(containerClosers);
    put("dir", dir);
}
/**
 * Registers the two linking tags: the head-level link and the anchor a.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void linkElements(TagInfo tagInfo){
    TagInfo link = new TagInfo("link", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
    put("link", link);

    TagInfo anchor = new TagInfo("a", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    // A new anchor closes an anchor that is still open.
    anchor.defineCloseBeforeTags("a");
    put("a", anchor);
}
/**
 * Registers the table tags: table, tr, td, th, tbody, thead, tfoot, col,
 * colgroup and caption.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void tableElements(TagInfo tagInfo){
    // Close-before list shared by td and th.
    final String cellClosers = "td,th,caption,colgroup";
    // Close-before list shared by tbody, thead, tfoot, colgroup and caption.
    final String sectionClosers = "td,th,tr,tbody,thead,tfoot,caption,colgroup";

    TagInfo table = new TagInfo("table", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    table.defineAllowedChildrenTags("tr,tbody,thead,tfoot,colgroup,caption");
    table.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    table.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
    put("table", table);

    TagInfo row = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
    row.defineFatalTags("table");
    row.defineRequiredEnclosingTags("tbody");
    row.defineAllowedChildrenTags("td,th");
    row.defineHigherLevelTags("thead,tfoot");
    row.defineCloseBeforeTags("tr,td,th,caption,colgroup");
    put("tr", row);

    // jericho parser requires <td></td>
    TagInfo dataCell = new TagInfo("td", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    dataCell.defineFatalTags("table");
    dataCell.defineRequiredEnclosingTags("tr");
    dataCell.defineCloseBeforeTags(cellClosers);
    put("td", dataCell);

    TagInfo headerCell = new TagInfo("th", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
    headerCell.defineFatalTags("table");
    headerCell.defineRequiredEnclosingTags("tr");
    headerCell.defineCloseBeforeTags(cellClosers);
    put("th", headerCell);

    // The three row-group tags share one definition and are registered in order.
    for (String groupName : new String[] {"tbody", "thead", "tfoot"}) {
        TagInfo rowGroup = new TagInfo(groupName, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
        rowGroup.defineFatalTags("table");
        rowGroup.defineAllowedChildrenTags("tr,form");
        rowGroup.defineCloseBeforeTags(sectionClosers);
        put(groupName, rowGroup);
    }

    TagInfo col = new TagInfo("col", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
    col.defineFatalTags("colgroup");
    put("col", col);

    TagInfo colgroup = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
    colgroup.defineFatalTags("table");
    colgroup.defineAllowedChildrenTags("col");
    colgroup.defineCloseBeforeTags(sectionClosers);
    put("colgroup", colgroup);

    TagInfo caption = new TagInfo("caption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    caption.defineFatalTags("table");
    caption.defineCloseBeforeTags(sectionClosers);
    put("caption", caption);
}
/**
 * Registers span plus the head-level tags style, bgsound, meta and base.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void styleElements(TagInfo tagInfo){
    TagInfo span = new TagInfo("span", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("span", span);

    TagInfo style = new TagInfo("style", ContentType.text, BelongsTo.HEAD, false, false, false, CloseTag.required, Display.none);
    put("style", style);

    // These three empty head tags share one definition and are registered in order.
    for (String headTagName : new String[] {"bgsound", "meta", "base"}) {
        TagInfo emptyHeadTag = new TagInfo(headTagName, ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
        put(headTagName, emptyHeadTag);
    }
}
/**
 * Registers the scripting and embedded-object tags: script, noscript,
 * applet, object and param.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void scriptElements(TagInfo tagInfo){
    TagInfo script = new TagInfo("script", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.none);
    put("script", script);

    TagInfo noscript = new TagInfo("noscript", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.block);
    put("noscript", noscript);

    TagInfo applet = new TagInfo("applet", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.any);
    put("applet", applet);

    TagInfo object = new TagInfo("object", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    put("object", object);

    TagInfo param = new TagInfo("param", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
    param.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    param.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
    put("param", param);
}
/**
 * Registers the image tags: img, and the image-map pair area and map.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void imgElements(TagInfo tagInfo){
    TagInfo img = new TagInfo("img", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
    put("img", img);

    TagInfo area = new TagInfo("area", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
    area.defineFatalTags("map");
    area.defineCloseBeforeTags("area");
    put("area", area);

    TagInfo imageMap = new TagInfo("map", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    imageMap.defineCloseBeforeTags("map");
    put("map", imageMap);
}
/**
 * Registers the remaining tags of this provider: listing, nobr, xmp, xml,
 * isindex, comment, server and iframe.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void olderElements(TagInfo tagInfo){
    // Close-before list shared by listing and isindex.
    final String blockClosers = "p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";

    TagInfo listing = new TagInfo("listing", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    listing.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    listing.defineCloseBeforeTags(blockClosers);
    put("listing", listing);

    TagInfo nobr = new TagInfo("nobr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    nobr.defineCloseBeforeTags("nobr");
    put("nobr", nobr);

    TagInfo xmp = new TagInfo("xmp", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
    put("xmp", xmp);

    TagInfo xml = new TagInfo("xml", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
    put("xml", xml);

    TagInfo isindex = new TagInfo("isindex", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.block);
    isindex.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    isindex.defineCloseBeforeTags(blockClosers);
    put("isindex", isindex);

    TagInfo comment = new TagInfo("comment", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
    put("comment", comment);

    TagInfo server = new TagInfo("server", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
    put("server", server);

    TagInfo iframe = new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    put("iframe", iframe);
}
/**
 * Stores a tag definition in this provider's tag map under the given name.
 *
 * @param tagName key the definition is stored under
 * @param tagInfo definition to store
 */
protected void put(String tagName, TagInfo tagInfo) {
    tagInfoMap.put(tagName, tagInfo);
}
/**
 * Looks up the definition registered for a tag name, ignoring the case of
 * the supplied name.
 *
 * @param tagName tag name in any case; may be {@code null}
 * @return the definition stored under the lowercased name, or {@code null}
 *         when {@code tagName} is {@code null} or no definition is stored
 */
public TagInfo getTagInfo(String tagName) {
    if (tagName == null) {
        // null named tagNode happens when a html fragment is being dealt with
        return null;
    }
    // Lowercase with Locale.ROOT rather than the JVM default locale: under a
    // Turkish or Azeri default locale "TITLE".toLowerCase() yields a dotless
    // "ı", which would never match the ASCII keys used at registration.
    return this.tagInfoMap.get(tagName.toLowerCase(java.util.Locale.ROOT));
}
}
@@ -0,0 +1,885 @@
/* Copyright (c) 2006-2017, Philokypros Ioulianou and the HTMLCleaner team
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Philokypros Ioulianou by sending e-mail to
philokypro_s@hotmail.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
public class Html5TagProvider implements ITagInfoProvider {
// Name of the strong tag; also spliced into CLOSE_BEFORE_COPY_INSIDE_TAGS below.
private static final String STRONG = "strong";
// Tag definitions keyed by tag name.
private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
// singleton instance, used if no other TagInfoProvider is specified
public final static Html5TagProvider INSTANCE = new Html5TagProvider();
// NOTE(review): the only assignment to this field in the constructor is
// commented out, so it appears to stay null unless set elsewhere - confirm.
public MathMLTagProvider INSTANCE2;
// Comma-separated tag list passed to TagInfo.defineCloseBeforeCopyInsideTags
// by most of the block-level definitions in this class.
private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo," + STRONG
+ ",em,q,b,i,sub,sup,small,s";
// Default comma-separated tag list passed to TagInfo.defineCloseBeforeTags.
private static final String CLOSE_BEFORE_TAGS = "p,summary,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
// Alternative value that also listed the heading tags, kept here disabled:
// private static final String CLOSE_BEFORE_TAGS =
// "h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
/**
 * Phrasing tags are those that can make up paragraphs along with text to
 * make Phrasing Content. Generally speaking, phrasing content only allows phrasing content as child tags.
 */
private static final String PHRASING_TAGS = "a,abbr,area,audio,b,bdi,bdo,br,button,canvas,cite,code,command,datalist,del,dfn,em,i,input,ins,kbd,keygen,label,link,map,mark,meta,meter,noscript,output,progress,p,ruby,samp,s,script,select,small,span,strong,sub,sup,svg,template,textarea,time,u,var,wbr";
/**
 * Most elements that are used in the body of documents and applications are categorized as flow content.
 */
private static final String FLOW_TAGS = "a,abbr,address,area,article,aside,audio,b,bdi,bdo,blockquote,br,button,canvas,cite,code,data,datalist,del,dfn,div,dl,em,embed,fieldset,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hr,i,iframe,img,input,ins,kbd,keygen,label,main,map,mark,math,meter,nav,noscript,object,ol,output,p,pre,progress,q,ruby,s,samp,script,section,select,small,span,strong,sub,sup,svg,table,template,textarea,time,u,ul,var,video,wbr,text";
/**
 * HTML5 Media Tags
 */
private static final String MEDIA_TAGS = "audio,video,object,source";
// Comma-separated list of the script-supporting tags (script and template).
private static final String SCRIPT_SUPPORTING_TAGS = "script,template";
public Html5TagProvider() {
TagInfo tagInfo = null;
embeddedContentTags(tagInfo);
semanticFlowTags(tagInfo);
interactiveTags(tagInfo);
groupingTags(tagInfo);
phrasingTags(tagInfo);
mediaTags(tagInfo);
editTags(tagInfo);
formTags(tagInfo);
tableTags(tagInfo);
metadataTags(tagInfo);
scriptingTags(tagInfo);
//INSTANCE2 = new MathMLTagProvider(tagInfo, tagInfoMap);
}
/**
 * Registers the two embedded-content root tags, svg and math, each with an
 * assumed namespace URI and prefix.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void embeddedContentTags(TagInfo tagInfo) {
    // SVG
    TagInfo svg = new TagInfo("svg", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    svg.defineAllowedChildrenTags("animate,animateMotion,animateTransform,discard,set,desc,title,metadata,linearGradient,radialGradient,pattern,circle,ellipse,line,path,polygon,polyline,rect,defs,g,svg,symbol,use,a,audio,canvas,clipPath,filter,foreignObject,iframe,image,marker,mask,script,style,switch,text,video,view");
    svg.setAssumedNamespace("http://www.w3.org/2000/svg");
    svg.setAssumedNamespacePrefix("svg");
    put("svg", svg);

    // MathML
    TagInfo math = new TagInfo("math", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    math.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    math.defineCloseBeforeTags("math,summary,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
    // Forbidding nested "math" is disabled (no defineForbiddenTags("math") call).
    //
    // Original note: "We'll add this later - right now it causes more problems
    // than it solves as there are no tag name clashes between MathML and HTML
    // unlike in SVG."
    // NOTE(review): it is unclear whether that note refers to the disabled
    // forbidden-tags rule or to the namespace assignment below - confirm.
    math.setAssumedNamespace("http://www.w3.org/1998/Math/MathML");
    math.setAssumedNamespacePrefix("mathml");
    put("math", math);
}
/**
 * Registers the HTML5 sectioning and semantic flow tags: section, nav,
 * article, aside, h1-h6, hgroup, header, footer, main and address
 * (15 total).
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void semanticFlowTags(TagInfo tagInfo) {
    // Headings use the default close-before list plus all six heading tags.
    final String headingClosers = CLOSE_BEFORE_TAGS + ",h1,h2,h3,h4,h5,h6";

    // section and nav share one definition.
    for (String sectioningName : new String[] {"section", "nav"}) {
        TagInfo sectioning = new TagInfo(sectioningName, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
        sectioning.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
        sectioning.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
        put(sectioningName, sectioning);
    }

    TagInfo article = new TagInfo("article", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    article.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    article.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    article.defineForbiddenTags("menu");
    put("article", article);

    TagInfo aside = new TagInfo("aside", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    aside.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    aside.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    aside.defineForbiddenTags("menu");
    aside.defineForbiddenTags("address");
    put("aside", aside);

    // The six heading levels share one definition and are registered in order.
    for (String headingName : new String[] {"h1", "h2", "h3", "h4", "h5", "h6"}) {
        TagInfo heading = new TagInfo(headingName, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
        heading.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
        heading.defineCloseBeforeTags(headingClosers);
        put(headingName, heading);
    }

    TagInfo hgroup = new TagInfo("hgroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    hgroup.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    hgroup.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    hgroup.defineAllowedChildrenTags("h1,h2,h3,h4,h5,h6");
    put("hgroup", hgroup);

    // header and footer share one definition.
    for (String edgeName : new String[] {"header", "footer"}) {
        TagInfo edge = new TagInfo(edgeName, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
        edge.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
        edge.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
        edge.defineForbiddenTags("menu,header,footer");
        put(edgeName, edge);
    }

    TagInfo main = new TagInfo("main", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    main.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    main.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    put("main", main);

    TagInfo address = new TagInfo("address", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    address.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    address.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    address.defineForbiddenTags("address");
    put("address", address);
}
/**
 * Registers the HTML5 interactive tags: details, summary, command, menu,
 * menuitem and dialog.
 *
 * @param tagInfo not read; every definition is built in its own local
 */
public void interactiveTags(TagInfo tagInfo) {
    TagInfo details = new TagInfo("details", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    details.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    details.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    put("details", details);

    TagInfo summary = new TagInfo("summary", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    summary.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    summary.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    summary.defineRequiredEnclosingTags("details");
    summary.defineForbiddenTags("summary");
    put("summary", summary);

    TagInfo command = new TagInfo("command", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    command.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    command.defineForbiddenTags("command");
    command.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    put("command", command);

    TagInfo menu = new TagInfo("menu", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    menu.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    menu.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    menu.defineAllowedChildrenTags("menuitem,li");
    put("menu", menu);

    TagInfo menuitem = new TagInfo("menuitem", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
    menuitem.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
    menuitem.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    menuitem.defineRequiredEnclosingTags("menu");
    put("menuitem", menuitem);

    // dialog is the only tag here with Display.any and no copy-inside list.
    TagInfo dialog = new TagInfo("dialog", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
    dialog.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
    put("dialog", dialog);
}
/**
* The HTML5 grouping tags (14 total)
*/
public void groupingTags(TagInfo tagInfo) {
// The tagInfo argument is never read: it is overwritten below and only
// serves as a scratch variable for the definition currently being built.
// <div>
tagInfo = new TagInfo("div", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("div", tagInfo);
// <figure> and its <figcaption>, which declares "figure" as required enclosing tag.
tagInfo = new TagInfo("figure", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("figure", tagInfo);
tagInfo = new TagInfo("figcaption", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.required, Display.any);
tagInfo.defineRequiredEnclosingTags("figure");
this.put("figcaption", tagInfo);
// <p>: uses its own literal close-before list (which includes "p", "summary"
// and "time") instead of the shared CLOSE_BEFORE_TAGS constant.
tagInfo = new TagInfo("p", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,summary,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml,time");
this.put("p", tagInfo);
// <pre>
tagInfo = new TagInfo("pre", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("pre", tagInfo);
// <ul>: close-before list is CLOSE_BEFORE_TAGS with "dl" prepended.
tagInfo = new TagInfo("ul", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("dl,"+CLOSE_BEFORE_TAGS);
//
// This is not correct, but is how most browsers seem to handle
// lists. Strictly, only an LI can be a child of a UL or OL
//
tagInfo.defineAllowedChildrenTags("li,ul,ol,div");
//
// Where we do have invalid children, we try to insert a LI to make it valid
// rather than move out the content.
//
tagInfo.setPreferredChildTag("li");
this.put("ul", tagInfo);
// <ol>: configured exactly like <ul> above.
tagInfo = new TagInfo("ol", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("dl,"+CLOSE_BEFORE_TAGS);
//
// This is not correct, but is how most browsers seem to handle
// lists. Strictly, only an LI can be a child of a UL or OL
//
tagInfo.defineAllowedChildrenTags("li,ul,ol,div");
//
// Where we do have invalid children, we try to insert a LI to make it valid
// rather than move out the content.
//
tagInfo.setPreferredChildTag("li");
this.put("ol", tagInfo);
// <li>: closing tag is optional; required enclosing tags are ol, menu, ul.
tagInfo = new TagInfo("li", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("li," + CLOSE_BEFORE_TAGS);
tagInfo.defineRequiredEnclosingTags("ol,menu,ul");
this.put("li", tagInfo);
// <dl>: allowed children are dt, dd, div plus SCRIPT_SUPPORTING_TAGS;
// "div" is the preferred child tag.
tagInfo = new TagInfo("dl", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
tagInfo.defineAllowedChildrenTags("dt,dd,div,"+SCRIPT_SUPPORTING_TAGS);
tagInfo.setPreferredChildTag("div");
this.put("dl", tagInfo);
// <dt> and <dd>: identical configuration, closing tag optional,
// "dl" declared as required enclosing tag.
tagInfo = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.block);
tagInfo.defineCloseBeforeTags("dt,dd");
tagInfo.defineAllowedChildrenTags(FLOW_TAGS);
tagInfo.defineRequiredEnclosingTags("dl");
this.put("dt", tagInfo);
tagInfo = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.block);
tagInfo.defineCloseBeforeTags("dt,dd");
tagInfo.defineAllowedChildrenTags(FLOW_TAGS);
tagInfo.defineRequiredEnclosingTags("dl");
this.put("dd", tagInfo);
// <hr>: no content, closing tag forbidden.
tagInfo = new TagInfo("hr", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("hr", tagInfo);
// <blockquote>
tagInfo = new TagInfo("blockquote", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("blockquote", tagInfo);
}
/**
* HTML5 phrasing tags -- text-level semantics (31 total)
*/
public void phrasingTags(TagInfo tagInfo) {
// The tagInfo argument is never read: it is overwritten below and only
// serves as a scratch variable for the definition currently being built.
// Unless noted otherwise, each tag here is inline, requires a closing tag
// and restricts its children to PHRASING_TAGS.
tagInfo = new TagInfo("em", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("em", tagInfo);
tagInfo = new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put(STRONG, tagInfo);
// small, s, sub, sup, b, i and u each pass the other six tags of that
// group, plus "blink", to defineCloseInsideCopyAfterTags.
tagInfo = new TagInfo("small", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,blink,s");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("small", tagInfo);
tagInfo = new TagInfo("s", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,small,blink");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("s", tagInfo);
// <a>: no allowed-children restriction; listed in its own close-before list.
tagInfo = new TagInfo("a", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseBeforeTags("a");
this.put("a", tagInfo);
// NOTE(review): wbr is declared with ContentType.none and a forbidden
// closing tag, yet still receives an allowed-children list -- looks
// redundant; confirm whether it has any effect.
tagInfo = new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.none);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("wbr", tagInfo);
tagInfo = new TagInfo("mark", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("mark", tagInfo);
tagInfo = new TagInfo("bdi", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("bdi", tagInfo);
tagInfo = new TagInfo("time", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("time", tagInfo);
// <data>: the only phrasing tag here that also uses CLOSE_BEFORE_TAGS.
tagInfo = new TagInfo("data", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("data", tagInfo);
tagInfo = new TagInfo("cite", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("cite", tagInfo);
tagInfo = new TagInfo("q", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("q", tagInfo);
tagInfo = new TagInfo("code", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("code", tagInfo);
// <span>: no allowed-children restriction is declared.
tagInfo = new TagInfo("span", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
this.put("span", tagInfo);
tagInfo = new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("bdo", tagInfo);
tagInfo = new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("dfn", tagInfo);
tagInfo = new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("kbd", tagInfo);
tagInfo = new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("abbr", tagInfo);
tagInfo = new TagInfo("var", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("var", tagInfo);
tagInfo = new TagInfo("samp", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("samp", tagInfo);
// <br>: no content, closing tag forbidden.
tagInfo = new TagInfo("br", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.none);
this.put("br", tagInfo);
tagInfo = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sup,small,blink,s");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("sub", tagInfo);
tagInfo = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,small,blink,s");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("sup", tagInfo);
tagInfo = new TagInfo("b", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("u,i,sub,sup,small,blink,s");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("b", tagInfo);
tagInfo = new TagInfo("i", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,u,sub,sup,small,blink,s");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("i", tagInfo);
// NOTE(review): "u" is the only tag in this method that passes true as the
// first boolean constructor flag (presumably "deprecated" -- confirm against
// the TagInfo constructor, and whether that is intended for HTML5).
tagInfo = new TagInfo("u", ContentType.all, BelongsTo.BODY, true,
false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseInsideCopyAfterTags("b,i,sub,sup,small,blink,s");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("u", tagInfo);
// ---->Html5 Ruby text (added rb,rtc)
// <ruby> allows only rt, rp, rb, rtc as children; each of those four
// declares "ruby" as its required enclosing tag and has an optional closing tag.
tagInfo = new TagInfo("ruby", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags("rt,rp,rb,rtc");
this.put("ruby", tagInfo);
tagInfo = new TagInfo("rtc", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.inline);
tagInfo.defineRequiredEnclosingTags("ruby");
tagInfo.defineAllowedChildrenTags("rt,"+PHRASING_TAGS);
this.put("rtc", tagInfo);
tagInfo = new TagInfo("rb", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.inline);
tagInfo.defineRequiredEnclosingTags("ruby");
this.put("rb", tagInfo);
// rt and rp are declared with ContentType.text.
tagInfo = new TagInfo("rt", ContentType.text, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.inline);
tagInfo.defineRequiredEnclosingTags("ruby");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("rt", tagInfo);
tagInfo = new TagInfo("rp", ContentType.text, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.inline);
tagInfo.defineRequiredEnclosingTags("ruby");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("rp", tagInfo);
}
/**
* HTML5 media/embedded tags (13 registered here)
*/
public void mediaTags(TagInfo tagInfo) {
// The tagInfo argument is never read: it is overwritten below and only
// serves as a scratch variable for the definition currently being built.
// <img>: no content, closing tag forbidden.
tagInfo = new TagInfo("img", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.inline);
this.put("img", tagInfo);
// <iframe>
tagInfo = new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.any);
this.put("iframe", tagInfo);
// <embed>: no content, closing tag forbidden, block display.
tagInfo = new TagInfo("embed", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
this.put("embed", tagInfo);
// <object> and its <param>, which declares "object" as required enclosing tag.
tagInfo = new TagInfo("object", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.any);
this.put("object", tagInfo);
tagInfo = new TagInfo("param", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.none);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
tagInfo.defineRequiredEnclosingTags("object");
this.put("param", tagInfo);
// audio, picture and video share one configuration built on MEDIA_TAGS.
tagInfo = new TagInfo("audio", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.any);
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
this.put("audio", tagInfo);
tagInfo = new TagInfo("picture", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.required, Display.any);
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
this.put("picture", tagInfo);
tagInfo = new TagInfo("video", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.any);
tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
this.put("video", tagInfo);
// NOTE(review): "picture" is registered above but is not among the
// required enclosing tags of "source" -- confirm how a source element
// nested directly in a picture element is treated.
tagInfo = new TagInfo("source", ContentType.none, BelongsTo.BODY,
false, false, false, CloseTag.forbidden, Display.any);
tagInfo.defineRequiredEnclosingTags("audio,video,object");
this.put("source", tagInfo);
// <track>: required enclosing tags come from MEDIA_TAGS.
tagInfo = new TagInfo("track", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.any);
tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
this.put("track", tagInfo);
// <canvas>
tagInfo = new TagInfo("canvas", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.any);
this.put("canvas", tagInfo);
// <area>: declares "map" as fatal tag; <map> allows only "area" children.
tagInfo = new TagInfo("area", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.none);
tagInfo.defineFatalTags("map");
tagInfo.defineCloseBeforeTags("area");
this.put("area", tagInfo);
tagInfo = new TagInfo("map", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.any);
tagInfo.defineCloseBeforeTags("map");
tagInfo.defineAllowedChildrenTags("area");
this.put("map", tagInfo);
}
/**
* The HTML5 edits tags (2 total)
*/
public void editTags(TagInfo tagInfo) {
    // The incoming tagInfo argument is not used. Both edit tags share one
    // configuration and differ only by name; "ins" is registered before "del".
    for (String editTagName : new String[] {"ins", "del"}) {
        this.put(editTagName, new TagInfo(editTagName, ContentType.all, BelongsTo.BODY,
                false, false, false, CloseTag.required, Display.any));
    }
}
/**
* The HTML5 table tags (10 registered here: table, tr, td, th, tbody, thead, tfoot, col, colgroup, caption)
*/
public void tableTags(TagInfo tagInfo) {
// The tagInfo argument is never read: it is overwritten below and only
// serves as a scratch variable for the definition currently being built.
// <table>: allowed children are the row/section/column/caption tags only.
tagInfo = new TagInfo("table", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,col,colgroup,caption");
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("table", tagInfo);
// <tr>: "table" is its fatal tag, "tbody" its required enclosing tag,
// thead/tfoot its higher-level tags; closing tag is optional.
tagInfo = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineRequiredEnclosingTags("tbody");
tagInfo.defineAllowedChildrenTags("td,th");
//
// Where we do have invalid children, we try to insert a TD to make it valid
// rather than move out the content.
//
tagInfo.setPreferredChildTag("td");
tagInfo.defineHigherLevelTags("thead,tfoot");
tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup");
this.put("tr", tagInfo);
// jericho parser requires <td></td>
// (hence td is registered with CloseTag.required while th below is optional)
tagInfo = new TagInfo("td", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineRequiredEnclosingTags("tr");
tagInfo.defineHigherLevelTags("tr");
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
this.put("td", tagInfo);
// <th>: like td but with an optional closing tag and no higher-level tags.
tagInfo = new TagInfo("th", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineRequiredEnclosingTags("tr");
tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
this.put("th", tagInfo);
// tbody, thead and tfoot share one configuration: fatal tag "table",
// allowed children "tr,form", optional closing tag.
tagInfo = new TagInfo("tbody", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineAllowedChildrenTags("tr,form");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("tbody", tagInfo);
tagInfo = new TagInfo("thead", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineAllowedChildrenTags("tr,form");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("thead", tagInfo);
tagInfo = new TagInfo("tfoot", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineAllowedChildrenTags("tr,form");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("tfoot", tagInfo);
// <col>: no content, closing tag forbidden, fatal tag "colgroup".
tagInfo = new TagInfo("col", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.block);
tagInfo.defineFatalTags("colgroup");
this.put("col", tagInfo);
// <colgroup>: allows only "col" children.
tagInfo = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.optional, Display.block);
tagInfo.defineFatalTags("table");
tagInfo.defineAllowedChildrenTags("col");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("colgroup", tagInfo);
// <caption>: the only table tag registered with Display.inline.
tagInfo = new TagInfo("caption", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.required, Display.inline);
tagInfo.defineFatalTags("table");
tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
this.put("caption", tagInfo);
}
/**
* The HTML5 forms tags (15 total)
*
*/
public void formTags(TagInfo tagInfo) {
// The tagInfo argument is never read: it is overwritten below and only
// serves as a scratch variable for the definition currently being built.
// <meter>
tagInfo = new TagInfo("meter", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
tagInfo.defineCloseBeforeTags("meter");
this.put("meter", tagInfo);
// <form>: lists itself as a forbidden tag. form, select, option and
// optgroup are the only tags here passing true as the third boolean
// constructor flag (presumably "ignore permitted" -- confirm against
// the TagInfo constructor).
tagInfo = new TagInfo("form", ContentType.all, BelongsTo.BODY, false,
false, true, CloseTag.required, Display.block);
tagInfo.defineForbiddenTags("form");
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("form", tagInfo);
// input, textarea and button share the close-before list "select,optgroup,option".
tagInfo = new TagInfo("input", ContentType.none, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.inline);
tagInfo.defineCloseBeforeTags("select,optgroup,option");
this.put("input", tagInfo);
tagInfo = new TagInfo("textarea", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.required, Display.inline);
tagInfo.defineCloseBeforeTags("select,optgroup,option");
this.put("textarea", tagInfo);
// <select>: allows only option and optgroup children.
tagInfo = new TagInfo("select", ContentType.all, BelongsTo.BODY, false,
false, true, CloseTag.required, Display.inline);
tagInfo.defineAllowedChildrenTags("option,optgroup");
tagInfo.defineCloseBeforeTags("option,optgroup,select");
this.put("select", tagInfo);
// <option>: text content, optional closing tag, fatal tags select and datalist.
tagInfo = new TagInfo("option", ContentType.text, BelongsTo.BODY,
false, false, true, CloseTag.optional, Display.inline);
tagInfo.defineFatalTags("select,datalist");
tagInfo.defineCloseBeforeTags("option");
this.put("option", tagInfo);
// <optgroup>: fatal tag select, allows only option children.
tagInfo = new TagInfo("optgroup", ContentType.all, BelongsTo.BODY,
false, false, true, CloseTag.required, Display.inline);
tagInfo.defineFatalTags("select");
tagInfo.defineAllowedChildrenTags("option");
tagInfo.defineCloseBeforeTags("optgroup");
this.put("optgroup", tagInfo);
// <button>
tagInfo = new TagInfo("button", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.any);
tagInfo.defineCloseBeforeTags("select,optgroup,option");
this.put("button", tagInfo);
// <label>: no additional rules are declared.
tagInfo = new TagInfo("label", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.inline);
this.put("label", tagInfo);
// <legend>: declares "fieldset" as its required enclosing tag.
tagInfo = new TagInfo("legend", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.block);
tagInfo.defineRequiredEnclosingTags("fieldset");
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
this.put("legend", tagInfo);
// <fieldset>
tagInfo = new TagInfo("fieldset", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.required, Display.block);
tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
this.put("fieldset", tagInfo);
// <progress>
tagInfo = new TagInfo("progress", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.required, Display.any);
tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
tagInfo.defineCloseBeforeTags("progress");
this.put("progress", tagInfo);
// <datalist>: allows only option children.
tagInfo = new TagInfo("datalist", ContentType.all, BelongsTo.BODY,
false, false, false, CloseTag.required, Display.any);
tagInfo.defineAllowedChildrenTags("option");
tagInfo.defineCloseBeforeTags("datalist");
this.put("datalist", tagInfo);
// NOTE(review): keygen combines ContentType.all with a forbidden closing
// tag, unlike the other forbidden-close tags in this class which use
// ContentType.none -- confirm this is intended.
tagInfo = new TagInfo("keygen", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.forbidden, Display.any);
this.put("keygen", tagInfo);
// <output>: close-before list is CLOSE_BEFORE_TAGS with "output" prepended.
tagInfo = new TagInfo("output", ContentType.all, BelongsTo.BODY, false,
false, false, CloseTag.required, Display.any);
tagInfo.defineCloseBeforeTags("output," + CLOSE_BEFORE_TAGS);
this.put("output", tagInfo);
}
/**
* HTML5 Document metadata tags
*/
public void metadataTags(TagInfo tagInfo) {
    // The incoming tagInfo argument is not used; every definition is built
    // in its own local and registered in the order meta, link, title, style, base.

    // As of HTML5, meta can be used in <body> where it has a @name attribute
    // TODO add attribute rules
    final TagInfo meta = new TagInfo("meta", ContentType.none, BelongsTo.HEAD_AND_BODY,
            false, false, false, CloseTag.forbidden, Display.none);
    put("meta", meta);

    // As of HTML5, link can be used in <body> where it has an @itemprop attribute
    // TODO add attribute rules
    final TagInfo link = new TagInfo("link", ContentType.none, BelongsTo.HEAD_AND_BODY,
            false, false, false, CloseTag.forbidden, Display.none);
    put("link", link);

    // title is the only metadata tag passing true as the second boolean flag.
    final TagInfo title = new TagInfo("title", ContentType.text, BelongsTo.HEAD,
            false, true, false, CloseTag.required, Display.none);
    put("title", title);

    // Current specification: style can only be used in <head>
    final TagInfo style = new TagInfo("style", ContentType.text, BelongsTo.HEAD,
            false, false, false, CloseTag.required, Display.none);
    put("style", style);

    final TagInfo base = new TagInfo("base", ContentType.none, BelongsTo.HEAD,
            false, false, false, CloseTag.forbidden, Display.none);
    put("base", base);
}
/**
* HTML5 scripting tags
*/
public void scriptingTags(TagInfo tagInfo) {
    // The incoming tagInfo argument is not used. Both tags may appear in
    // head and body and require a closing tag; they differ only in display.
    this.put("script", new TagInfo("script", ContentType.all, BelongsTo.HEAD_AND_BODY,
            false, false, false, CloseTag.required, Display.none));
    this.put("noscript", new TagInfo("noscript", ContentType.all, BelongsTo.HEAD_AND_BODY,
            false, false, false, CloseTag.required, Display.block));
}
/**
* It inserts the tag node into the tagInfoMap.
*
* @param tagName
* The name of the tag
* @param tagInfo
* The info about tag node
*/
protected void put(String tagName, TagInfo tagInfo) {
    // Single registration point used by all the tag-definition methods.
    tagInfoMap.put(tagName, tagInfo);
}
/**
* It returns the tag information.
*
* @param tagName
* The name of the tag to return
* @return TagInfo The information about tag node
*/
public TagInfo getTagInfo(String tagName) {
    // A null tag name occurs when an HTML fragment is being dealt with;
    // there is nothing to look up in that case.
    if (tagName == null) {
        return null;
    }
    // Tags are registered in this class under lower-case ASCII names, so the
    // key must be lower-cased independently of the JVM default locale.
    // With the no-argument toLowerCase(), a Turkish default locale maps 'I'
    // to a dotless 'ı' and e.g. "TITLE" or "IMG" would never be found.
    return this.tagInfoMap.get(tagName.toLowerCase(java.util.Locale.ROOT));
}
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,62 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
* <p>General HtmlCleaner runtime exception.</p>
*/
public class HtmlCleanerException extends RuntimeException {

    /**
     * Creates an exception carrying a generic default message.
     */
    public HtmlCleanerException() {
        // Previously read "HtmlCleaner expression occureed!" (typo).
        this("HtmlCleaner exception occurred!");
    }

    /**
     * Creates an exception that wraps another throwable.
     *
     * @param cause the underlying cause
     */
    public HtmlCleanerException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with the given detail message.
     *
     * @param message the detail message
     */
    public HtmlCleanerException(String message) {
        super(message);
    }

    /**
     * Creates an exception with the given detail message and cause.
     *
     * @param message the detail message
     * @param cause the underlying cause
     */
    public HtmlCleanerException(String message, Throwable cause) {
        super(message, cause);
    }
}
@@ -0,0 +1,354 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import org.apache.tools.ant.BuildException;
import org.apache.tools.ant.Task;
import java.net.URL;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.FileOutputStream;
import java.io.ByteArrayOutputStream;
import java.util.Map;
import java.util.TreeMap;
/**
* <p>Support for ANT.</p>
*/
public class HtmlCleanerForAnt extends Task {
// Input: execute() requires that at least one of text / src is non-null.
private String text;
private String src;
private String dest;
// Both charsets default to CleanerProperties.DEFAULT_CHARSET.
private String incharset = CleanerProperties.DEFAULT_CHARSET;
private String outcharset = CleanerProperties.DEFAULT_CHARSET;
// When non-null, execute() builds the cleaner with a ConfigFileTagProvider
// reading this file instead of using the default HtmlCleaner constructor.
private String taginfofile = null;
private String outputtype = "simple";
// The options below are copied one-to-one into CleanerProperties by execute().
private boolean advancedxmlescape = true;
private boolean usecdata = true;
private String usecdatafor = "script,style";
private boolean specialentities = true;
private boolean unicodechars = true;
private boolean omitunknowntags = false;
private boolean treatunknowntagsascontent = false;
private boolean omitdeprtags = false;
private boolean treatdeprtagsascontent = false;
private boolean omitcomments = false;
private boolean omitxmldecl = false;
private boolean omitdoctypedecl = true;
private boolean omithtmlenvelope = false;
private boolean useemptyelementtags = true;
private boolean allowmultiwordattributes = true;
private boolean allowhtmlinsideattributes = false;
private boolean ignoreqe = false;
private boolean namespacesaware = true;
private String hyphenreplacement = "=";
private String prunetags = "";
private String booleanatts = CleanerProperties.BOOL_ATT_SELF;
private String nodebyxpath = null;
// Pipe-separated list of <key>[=<value>] transformation items parsed by execute().
private String transform = null;
// NOTE(review): no setter for the next two fields appears among the setters
// that directly follow these declarations, so unless one is declared further
// down the class they always keep these defaults -- confirm.
private boolean allowInvalidAttributeNames = false;
private String invalidAttributeNamePrefix = "";
/**
 * Sets the HTML text to clean. execute() fails with a BuildException when
 * neither this text nor {@code src} has been provided.
 *
 * @param text HTML content
 */
public void setText(String text) {
this.text = text;
}
/**
 * Sets the {@code src} attribute. execute() fails with a BuildException when
 * neither this nor the text body has been provided.
 *
 * @param src source of the HTML (presumably a file path or URL -- confirm
 *            against the part of execute() that reads it)
 */
public void setSrc(String src) {
this.src = src;
}
/**
 * Sets the {@code dest} attribute.
 *
 * @param dest presumably the destination the cleaned output is written to --
 *             confirm against execute()
 */
public void setDest(String dest) {
this.dest = dest;
}
/**
 * Sets the {@code incharset} attribute; when never called the field keeps
 * {@code CleanerProperties.DEFAULT_CHARSET}.
 *
 * @param incharset presumably the charset used to read the input -- confirm
 */
public void setIncharset(String incharset) {
this.incharset = incharset;
}
/**
 * Sets the {@code outcharset} attribute; when never called the field keeps
 * {@code CleanerProperties.DEFAULT_CHARSET}.
 *
 * @param outcharset presumably the charset used to write the output -- confirm
 */
public void setOutcharset(String outcharset) {
this.outcharset = outcharset;
}
/**
 * Sets the path of a tag-info configuration file. When non-null, execute()
 * creates the cleaner with a {@code ConfigFileTagProvider} over that file.
 *
 * @param taginfofile path to the tag info file, or null for the default provider
 */
public void setTaginfofile(String taginfofile) {
this.taginfofile = taginfofile;
}
/**
 * Sets the {@code outputtype} attribute; the field defaults to "simple".
 *
 * @param outputtype presumably selects the serializer used for the result --
 *                   confirm the accepted values against execute()
 */
public void setOutputtype(String outputtype) {
this.outputtype = outputtype;
}
/**
 * Sets the value execute() passes to {@code CleanerProperties.setAdvancedXmlEscape}.
 *
 * @param advancedxmlescape new flag value (field default: true)
 */
public void setAdvancedxmlescape(boolean advancedxmlescape) {
this.advancedxmlescape = advancedxmlescape;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setUseCdataForScriptAndStyle}.
 *
 * @param usecdata new flag value (field default: true)
 */
public void setUsecdata(boolean usecdata) {
this.usecdata = usecdata;
}
/**
 * Sets the value execute() passes to {@code CleanerProperties.setUseCdataFor}.
 *
 * @param usecdatafor comma-separated tag names (field default: "script,style")
 */
public void setUsecdatafor(String usecdatafor) {
this.usecdatafor = usecdatafor;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setTranslateSpecialEntities}.
 *
 * @param specialentities new flag value (field default: true)
 */
public void setSpecialentities(boolean specialentities) {
this.specialentities = specialentities;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setRecognizeUnicodeChars}.
 *
 * @param unicodechars new flag value (field default: true)
 */
public void setUnicodechars(boolean unicodechars) {
this.unicodechars = unicodechars;
}
/**
 * Sets the value execute() passes to {@code CleanerProperties.setOmitUnknownTags}.
 *
 * @param omitunknowntags new flag value (field default: false)
 */
public void setOmitunknowntags(boolean omitunknowntags) {
this.omitunknowntags = omitunknowntags;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setTreatUnknownTagsAsContent}.
 *
 * @param treatunknowntagsascontent new flag value (field default: false)
 */
public void setTreatunknowntagsascontent(boolean treatunknowntagsascontent) {
this.treatunknowntagsascontent = treatunknowntagsascontent;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setOmitDeprecatedTags}.
 *
 * @param omitdeprtags new flag value (field default: false)
 */
public void setOmitdeprtags(boolean omitdeprtags) {
this.omitdeprtags = omitdeprtags;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setTreatDeprecatedTagsAsContent}.
 *
 * @param treatdeprtagsascontent new flag value (field default: false)
 */
public void setTreatdeprtagsascontent(boolean treatdeprtagsascontent) {
this.treatdeprtagsascontent = treatdeprtagsascontent;
}
/**
 * Sets the value execute() passes to {@code CleanerProperties.setOmitComments}.
 *
 * @param omitcomments new flag value (field default: false)
 */
public void setOmitcomments(boolean omitcomments) {
this.omitcomments = omitcomments;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setOmitXmlDeclaration}.
 *
 * @param omitxmldecl new flag value (field default: false)
 */
public void setOmitxmldecl(boolean omitxmldecl) {
this.omitxmldecl = omitxmldecl;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setOmitDoctypeDeclaration}.
 *
 * @param omitdoctypedecl new flag value (field default: true)
 */
public void setOmitdoctypedecl(boolean omitdoctypedecl) {
this.omitdoctypedecl = omitdoctypedecl;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setOmitHtmlEnvelope}.
 *
 * @param omithtmlenvelope new flag value (field default: false)
 */
public void setOmithtmlenvelope(boolean omithtmlenvelope) {
this.omithtmlenvelope = omithtmlenvelope;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setUseEmptyElementTags}.
 *
 * @param useemptyelementtags new flag value (field default: true)
 */
public void setUseemptyelementtags(boolean useemptyelementtags) {
this.useemptyelementtags = useemptyelementtags;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setAllowMultiWordAttributes}.
 *
 * @param allowmultiwordattributes new flag value (field default: true)
 */
public void setAllowmultiwordattributes(boolean allowmultiwordattributes) {
this.allowmultiwordattributes = allowmultiwordattributes;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setAllowHtmlInsideAttributes}.
 *
 * @param allowhtmlinsideattributes new flag value (field default: false)
 */
public void setAllowhtmlinsideattributes(boolean allowhtmlinsideattributes) {
this.allowhtmlinsideattributes = allowhtmlinsideattributes;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setIgnoreQuestAndExclam}.
 *
 * @param ignoreqe new flag value (field default: false)
 */
public void setIgnoreqe(boolean ignoreqe) {
this.ignoreqe = ignoreqe;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setNamespacesAware}.
 *
 * @param namespacesaware new flag value (field default: true)
 */
public void setNamespacesaware(boolean namespacesaware) {
this.namespacesaware = namespacesaware;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setHyphenReplacementInComment}.
 *
 * @param hyphenreplacement replacement string (field default: "=")
 */
public void setHyphenreplacement(String hyphenreplacement) {
this.hyphenreplacement = hyphenreplacement;
}
/**
 * Sets the value execute() passes to {@code CleanerProperties.setPruneTags}.
 *
 * @param prunetags tags to prune (field default: empty string)
 */
public void setPrunetags(String prunetags) {
this.prunetags = prunetags;
}
/**
 * Sets the value execute() passes to
 * {@code CleanerProperties.setBooleanAttributeValues}.
 *
 * @param booleanatts new value (field default: {@code CleanerProperties.BOOL_ATT_SELF})
 */
public void setBooleanatts(String booleanatts) {
this.booleanatts = booleanatts;
}
/**
 * Sets the {@code nodebyxpath} attribute (field default: null).
 *
 * @param nodebyxpath presumably an XPath expression selecting the node to
 *                    output -- confirm against execute()
 */
public void setNodebyxpath(String nodebyxpath) {
this.nodebyxpath = nodebyxpath;
}
/**
 * Sets the cleaner transformations. execute() tokenizes a non-empty value on
 * the pipe character; the expected format is
 * {@code <transkey1>[=<transvalue1>]|<transkey2>[=<transvalue2>]...}.
 *
 * @param transform pipe-separated transformation items, or null for none
 */
public void setTransform(String transform) {
this.transform = transform;
}
/**
 * Stores the given text in the same field as {@link #setText(String)},
 * replacing any previous value.
 * Presumably invoked by Ant with the task element's nested text -- confirm.
 *
 * @param text HTML content
 */
public void addText(String text) {
this.text = text;
}
/**
 * Runs the Ant task: cleans the configured HTML input and serializes the result.
 * <p>
 * Input comes from {@code src} (fetched as a URL when it starts with {@code http://} or
 * {@code https://}, otherwise read as a file) or, when {@code src} is not set, from the
 * stored text. The cleaned tree - or the first {@code TagNode} matched by
 * {@code nodebyxpath} - is written with the serializer selected by {@code outputtype} to
 * standard output (empty {@code dest}), to an Ant property ({@code dest="property:NAME"})
 * or to the file named by {@code dest}.
 *
 * @throws BuildException if neither {@code src} nor text is given, or if reading the input,
 *         evaluating the XPath expression or writing the output fails
 */
@Override
public void execute() throws BuildException {
    HtmlCleaner cleaner;
    if (this.taginfofile != null) {
        cleaner = new HtmlCleaner(new ConfigFileTagProvider(new File(this.taginfofile)));
    } else {
        cleaner = new HtmlCleaner();
    }

    if (text == null && src == null) {
        throw new BuildException("Either attribute 'src' or text body containing HTML must be specified!");
    }

    CleanerProperties props = cleaner.getProperties();
    props.setAdvancedXmlEscape(this.advancedxmlescape);
    props.setUseCdataFor(this.usecdatafor);
    props.setUseCdataForScriptAndStyle(this.usecdata);
    props.setTranslateSpecialEntities(this.specialentities);
    props.setRecognizeUnicodeChars(this.unicodechars);
    props.setOmitUnknownTags(this.omitunknowntags);
    props.setTreatUnknownTagsAsContent(this.treatunknowntagsascontent);
    props.setOmitDeprecatedTags(this.omitdeprtags);
    props.setTreatDeprecatedTagsAsContent(this.treatdeprtagsascontent);
    props.setOmitComments(this.omitcomments);
    props.setOmitXmlDeclaration(this.omitxmldecl);
    props.setOmitDoctypeDeclaration(this.omitdoctypedecl);
    props.setOmitHtmlEnvelope(this.omithtmlenvelope);
    props.setUseEmptyElementTags(this.useemptyelementtags);
    props.setAllowMultiWordAttributes(this.allowmultiwordattributes);
    props.setAllowHtmlInsideAttributes(this.allowhtmlinsideattributes);
    props.setIgnoreQuestAndExclam(this.ignoreqe);
    props.setNamespacesAware(this.namespacesaware);
    props.setHyphenReplacementInComment(this.hyphenreplacement);
    props.setPruneTags(this.prunetags);
    props.setBooleanAttributeValues(this.booleanatts);
    props.setAllowInvalidAttributeNames(this.allowInvalidAttributeNames);
    props.setInvalidXmlAttributeNamePrefix(this.invalidAttributeNamePrefix);

    // Set cleaner transformations if specified in the "transform" attribute.
    // Expected format: <transkey1>[=<transvalue1>]|<transkey2>[=<transvalue2>]...
    // (the separator is the pipe character).
    if (!Utils.isEmptyString(transform)) {
        String[] transItems = Utils.tokenize(transform, "|");
        Map<String, String> transInfos = new TreeMap<String, String>();
        for (String item : transItems) {
            int index = item.indexOf('=');
            // An item without '=' (or starting with it) is a key with no value.
            String key = index <= 0 ? item : item.substring(0, index);
            String value = index <= 0 ? null : item.substring(index + 1);
            transInfos.put(key, value);
        }
        cleaner.initCleanerTransformations(transInfos);
    }

    try {
        TagNode node;
        if (src != null && (src.startsWith("http://") || src.startsWith("https://"))) {
            node = cleaner.clean(new URL(src), incharset);
        } else if (src != null) {
            node = cleaner.clean(new File(src), incharset);
        } else {
            node = cleaner.clean(text);
        }

        // If the user specified an XPath expression to choose the node to serialize,
        // evaluate it and use the first TagNode instance in the resulting array.
        if (nodebyxpath != null) {
            final Object[] xpathResult = node.evaluateXPath(nodebyxpath);
            for (Object element : xpathResult) {
                if (element instanceof TagNode) {
                    node = (TagNode) element;
                    break;
                }
            }
        }

        OutputStream out;
        String antPropertyName = "";
        ByteArrayOutputStream propertyBuffer = null;
        // Only the stream opened here for a destination file is owned (and closed) by this method.
        boolean closeOut = false;
        if (dest == null || "".equals(dest.trim())) {
            out = System.out;
        } else if (dest.startsWith("property:")) {
            propertyBuffer = new ByteArrayOutputStream();
            out = propertyBuffer;
            antPropertyName = dest.substring(dest.indexOf(':') + 1);
            getProject().log("Setting property " + antPropertyName);
        } else {
            out = new FileOutputStream(dest);
            closeOut = true;
        }

        try {
            if ("compact".equals(outputtype)) {
                new CompactXmlSerializer(props).writeToStream(node, out, outcharset);
            } else if ("browser-compact".equals(outputtype)) {
                new BrowserCompactXmlSerializer(props).writeToStream(node, out, outcharset);
            } else if ("pretty".equals(outputtype)) {
                new PrettyXmlSerializer(props).writeToStream(node, out, outcharset);
            } else {
                new SimpleXmlSerializer(props).writeToStream(node, out, outcharset);
            }

            if (propertyBuffer != null && antPropertyName.length() > 0) {
                // Decode with the charset the serializer wrote with, not the platform default.
                String value = outcharset == null
                        ? propertyBuffer.toString()
                        : propertyBuffer.toString(outcharset);
                getProject().setNewProperty(antPropertyName, value);
            }
        } finally {
            if (closeOut) {
                out.close();
            }
        }
    } catch (IOException e) {
        throw new BuildException(e);
    } catch (XPatherException e) {
        throw new BuildException(e);
    }
}
/**
 * @return the stored {@code allowInvalidAttributeNames} flag, which {@code execute()}
 *         forwards to {@code CleanerProperties.setAllowInvalidAttributeNames}
 */
public boolean isAllowInvalidAttributeNames() {
return allowInvalidAttributeNames;
}
/**
 * Stores the {@code allowInvalidAttributeNames} flag; {@code execute()} forwards it to
 * {@code CleanerProperties.setAllowInvalidAttributeNames}.
 *
 * @param allowInvalidAttributeNames value to store
 */
public void setAllowInvalidAttributeNames(boolean allowInvalidAttributeNames) {
this.allowInvalidAttributeNames = allowInvalidAttributeNames;
}
/**
 * @return the stored prefix, which {@code execute()} forwards to
 *         {@code CleanerProperties.setInvalidXmlAttributeNamePrefix}
 */
public String getInvalidAttributeNamePrefix() {
return invalidAttributeNamePrefix;
}
/**
 * Stores the prefix that {@code execute()} forwards to
 * {@code CleanerProperties.setInvalidXmlAttributeNamePrefix}.
 *
 * @param invalidAttributeNamePrefix prefix to store
 */
public void setInvalidAttributeNamePrefix(String invalidAttributeNamePrefix) {
this.invalidAttributeNamePrefix = invalidAttributeNamePrefix;
}
}
@@ -0,0 +1,16 @@
package org.htmlcleaner;
import java.util.List;
/**
 * A node of the document tree: a {@link BaseToken} that is linked to a parent
 * {@link TagNode} and can report its siblings.
 */
public interface HtmlNode extends BaseToken {

    /**
     * @return the siblings of this node (presumably the parent's list of children -
     *         confirm against the implementing classes)
     */
    List<? extends BaseToken> getSiblings();

    /**
     * @return the tag node recorded as this node's parent
     */
    TagNode getParent();

    /**
     * @param parent the tag node to record as this node's parent
     */
    void setParent(TagNode parent);
}
@@ -0,0 +1,141 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.*;
import java.util.*;
/**
 * <p>Base class of the HTML serializers - holds the tag-writing logic they share.</p>
 */
public abstract class HtmlSerializer extends Serializer {

    protected HtmlSerializer(CleanerProperties props) {
        super(props);
    }

    /**
     * Tells whether a tag is written in minimized form ({@code <name ... />}).
     *
     * @param tagNode tag to examine
     * @return {@code true} when the tag has a known definition that marks it as an empty tag
     *         and the node has no children
     */
    protected boolean isMinimizedTagSyntax(TagNode tagNode) {
        final TagInfo definition = props.getTagInfoProvider().getTagInfo(tagNode.getName());
        if (definition == null) {
            return false;
        }
        return !tagNode.hasChildren() && definition.isEmptyTag();
    }

    /**
     * @param tagNode tag to examine
     * @return {@code true} for script and style tags, as decided by {@code isScriptOrStyle}
     */
    protected boolean dontEscape(TagNode tagNode) {
        return isScriptOrStyle(tagNode);
    }

    /**
     * @param content text to escape
     * @return {@code content} escaped through {@code Utils.escapeHtml} with the current properties
     */
    protected String escapeText(String content) {
        return Utils.escapeHtml(content, props);
    }

    /**
     * Writes the opening tag of a node: name, attributes and, when namespace aware,
     * the namespace declarations. Nodes with an empty name produce no output.
     *
     * @param tagNode node whose opening tag is written
     * @param writer  destination
     * @param newLine whether a line break follows a minimized tag
     * @throws IOException if the writer fails
     */
    protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
        String name = tagNode.getName();
        if (Utils.isEmptyString(name)) {
            return;
        }

        final boolean namespaceAware = props.isNamespacesAware();
        // Without namespace support the prefix is dropped from the tag name.
        if (!namespaceAware && Utils.getXmlNSPrefix(name) != null) {
            name = Utils.getXmlName(name);
        }
        writer.write("<" + name);

        for (Map.Entry<String, String> attribute : tagNode.getAttributes().entrySet()) {
            String attributeName = attribute.getKey();
            String attributeValue =
                    Utils.deserializeEntities(attribute.getValue(), props.isRecognizeUnicodeChars());

            // The WHATWG attribute identifier rules are applied during tokenizing,
            // so attribute names are never invalid at this point.
            if (attributeName == null) {
                continue;
            }
            if (!namespaceAware && Utils.getXmlNSPrefix(attributeName) != null) {
                attributeName = Utils.getXmlName(attributeName);
            }

            // In namespace-aware mode a plain "xmlns" attribute is skipped here;
            // declarations are written from the node's namespace map below.
            boolean skipped = namespaceAware && attributeName.equalsIgnoreCase("xmlns");
            if (!skipped) {
                writer.write(" " + attributeName + "=\"" + escapeText(attributeValue) + "\"");
            }
        }

        if (namespaceAware) {
            Map<String, String> declarations = tagNode.getNamespaceDeclarations();
            if (declarations != null) {
                for (Map.Entry<String, String> declaration : declarations.entrySet()) {
                    String prefix = declaration.getKey();
                    String declarationName = prefix.length() > 0 ? "xmlns" + ":" + prefix : "xmlns";
                    writer.write(" " + declarationName + "=\"" + escapeText(declaration.getValue()) + "\"");
                }
            }
        }

        if (!isMinimizedTagSyntax(tagNode)) {
            writer.write(">");
            return;
        }
        writer.write(" />");
        if (newLine) {
            writer.write("\n");
        }
    }

    /**
     * Writes the closing tag of a node. Nodes with an empty name produce no output.
     *
     * @param tagNode node whose closing tag is written
     * @param writer  destination
     * @param newLine whether a line break follows the closing tag
     * @throws IOException if the writer fails
     */
    protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
        String name = tagNode.getName();
        if (Utils.isEmptyString(name)) {
            return;
        }

        boolean prefixed = Utils.getXmlNSPrefix(name) != null;
        if (prefixed && !props.isNamespacesAware()) {
            name = Utils.getXmlName(name);
        }

        writer.write("</" + name + ">");
        if (newLine) {
            writer.write("\n");
        }
    }
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,52 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
 * <p>
 * Supplies the {@link TagInfo} instances that make up the collection of tag definitions
 * used in the cleanup process. The desired behaviour of the cleaner can be achieved by
 * providing an implementation of this interface.<br/>
 * In most cases an implementation will be, or will contain, some kind of Map.
 * </p>
 */
public interface ITagInfoProvider {

    /**
     * Looks up the definition of a tag.
     *
     * @param tagName name of the tag to look up
     * @return the definition for the tag; may be {@code null}
     *         ({@code HtmlSerializer} null-checks the result)
     */
    TagInfo getTagInfo(String tagName);
}
@@ -0,0 +1,254 @@
package org.htmlcleaner;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.jdom2.CDATA;
import org.jdom2.Comment;
import org.jdom2.DefaultJDOMFactory;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.jdom2.Text;
/**
 * <p>
 * JDom serializer - builds a JDOM {@link Document} from a {@link TagNode} tree.
 * </p>
 */
public class JDomSerializer {

    private static final String CSS_COMMENT_START = "/*";
    private static final String CSS_COMMENT_END = "*/";
    private static final String NEW_LINE = "\n";

    private DefaultJDOMFactory factory;
    protected CleanerProperties props;
    protected boolean escapeXml = true;

    /**
     * @param props     cleaner properties consulted during conversion
     * @param escapeXml whether attribute values and text content are XML-escaped
     */
    public JDomSerializer(CleanerProperties props, boolean escapeXml) {
        this.props = props;
        this.escapeXml = escapeXml;
    }

    /**
     * Creates a serializer that escapes XML.
     *
     * @param props cleaner properties consulted during conversion
     */
    public JDomSerializer(CleanerProperties props) {
        this(props, true);
    }

    /**
     * Converts the tree below {@code rootNode} into a JDOM document.
     *
     * @param rootNode root of the tree to convert
     * @return the new document, or {@code null} when the root node has no name
     */
    public Document createJDom(TagNode rootNode) {
        this.factory = new DefaultJDOMFactory();

        // Without an actual root node there is nothing to build.
        if (rootNode.getName() == null) {
            return null;
        }

        Element root = createElement(rootNode);
        Document result = this.factory.document(root);
        setAttributes(rootNode, root);
        createSubnodes(root, rootNode.getAllChildren());
        return result;
    }

    /**
     * Creates the JDOM element for a tag node, resolving its namespace when the
     * properties ask for namespace awareness.
     */
    private Element createElement(TagNode node) {
        // XML element names are defined more strictly than HTML tag identifiers.
        // See https://www.w3.org/TR/xml/#NT-Name
        // vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
        String name = Utils.sanitizeXmlIdentifier(node.getName());

        final boolean namespaceAware = props.isNamespacesAware();
        final String prefix = Utils.getXmlNSPrefix(name);
        final Map<String, String> declarations = node.getNamespaceDeclarations();

        if (prefix != null) {
            name = Utils.getXmlName(name);
        }

        String uri = null;
        if (namespaceAware) {
            // Own declarations first (the default namespace is keyed by ""), then the ancestors.
            if (declarations != null) {
                uri = declarations.get(prefix != null ? prefix : "");
            }
            if (uri == null) {
                uri = node.getNamespaceURIOnPath(prefix);
            }
            // An undeclared prefix falls back to using the prefix itself as URI.
            if (uri == null && prefix != null) {
                uri = prefix;
            }
        }

        Element element;
        if (uri == null) {
            element = factory.element(name);
        } else if (prefix == null) {
            element = factory.element(name, Namespace.getNamespace(uri));
        } else {
            element = factory.element(name, Namespace.getNamespace(prefix, uri));
        }

        if (namespaceAware) {
            defineNamespaceDeclarations(node, element);
        }
        return element;
    }

    /**
     * Copies the namespace declarations of a tag node onto its JDOM element.
     */
    private void defineNamespaceDeclarations(TagNode node, Element element) {
        Map<String, String> declarations = node.getNamespaceDeclarations();
        if (declarations == null) {
            return;
        }
        for (Map.Entry<String, String> declaration : declarations.entrySet()) {
            String prefix = declaration.getKey();
            String uri = declaration.getValue();
            Namespace namespace;
            if (prefix == null || "".equals(prefix)) {
                namespace = Namespace.getNamespace(uri);
            } else {
                namespace = Namespace.getNamespace(prefix, uri);
            }
            element.addNamespaceDeclaration(namespace);
        }
    }

    /**
     * Copies the attributes of a tag node onto its JDOM element, skipping names that are
     * not valid XML identifiers and plain {@code xmlns} attributes.
     */
    private void setAttributes(TagNode node, Element element) {
        for (Map.Entry<String, String> attribute : node.getAttributes().entrySet()) {
            String name = attribute.getKey();
            String value = attribute.getValue();

            if (escapeXml) {
                value = Utils.deserializeEntities(value, props.isRecognizeUnicodeChars());
                value = Utils.escapeXml(value, props, true);
            }

            // Fix any invalid attribute names.
            if (!props.isAllowInvalidAttributeNames()) {
                name = Utils.sanitizeXmlIdentifier(name, props.getInvalidXmlAttributeNamePrefix(), "");
            }

            // Even if invalid attribute names were wanted, JDom would not accept them.
            if (name == null || !Utils.isValidXmlIdentifier(name)) {
                continue;
            }

            Namespace namespace = null;
            String prefix = Utils.getXmlNSPrefix(name);
            if (prefix != null) {
                name = Utils.getXmlName(name);
                if (props.isNamespacesAware()) {
                    String uri = node.getNamespaceURIOnPath(prefix);
                    if (uri == null) {
                        uri = prefix;
                    }
                    if (!prefix.startsWith("xml")) {
                        namespace = Namespace.getNamespace(prefix, uri);
                    }
                }
            }

            // xmlns attributes are not added manually; JDOM handles them
            // through its namespace mechanism.
            if (name.equals("xmlns")) {
                continue;
            }
            if (namespace != null) {
                element.setAttribute(name, value, namespace);
            } else {
                element.setAttribute(name, value);
            }
        }
    }

    /**
     * Converts the children of a tag node and appends them to {@code element}. For elements
     * configured to use CDATA, all text content is collected in a single CDATA section
     * wrapped in CSS-style comment markers.
     */
    private void createSubnodes(Element element, List<? extends BaseToken> tagChildren) {
        if (tagChildren == null) {
            return;
        }

        // For script and style nodes, check whether CDATA is to be used.
        CDATA cdata = null;
        if (props.isUseCdataFor(element.getName())) {
            cdata = factory.cdata("");
            element.addContent(factory.text(CSS_COMMENT_START));
            element.addContent(cdata);
        }

        for (Object child : tagChildren) {
            if (child instanceof CommentNode) {
                String commentText = ((CommentNode) child).getContent().toString();
                element.addContent(factory.comment(commentText));
            } else if (child instanceof ContentNode) {
                final String ownerName = element.getName();
                String content = child.toString();
                if (props.isUseCdataFor(ownerName)) {
                    // For CDATA sections the start and end tokens must not be
                    // returned. See issue #106.
                    if (child instanceof CData) {
                        content = ((CData) child).getContentWithoutStartAndEndTokens();
                    }
                } else if (escapeXml) {
                    content = Utils.escapeXml(content, props, true);
                }

                if (cdata == null) {
                    element.addContent(factory.text(content));
                } else {
                    cdata.append(content);
                }
            } else if (child instanceof TagNode) {
                TagNode childTag = (TagNode) child;
                Element childElement = createElement(childTag);
                setAttributes(childTag, childElement);
                // recursively create subnodes
                createSubnodes(childElement, childTag.getAllChildren());
                element.addContent(childElement);
            } else if (child instanceof List) {
                List nested = (List) child;
                createSubnodes(element, nested);
            }
        }

        if (cdata != null) {
            // Close the comment opened before the CDATA start, on a line of its own ...
            String collected = cdata.getText();
            String leadingBreak = collected.startsWith(NEW_LINE) ? "" : NEW_LINE;
            cdata.setText(CSS_COMMENT_END + leadingBreak + collected);

            // ... and open the comment that hides the CDATA end marker.
            if (!cdata.getText().endsWith(NEW_LINE)) {
                cdata.append(NEW_LINE);
            }
            cdata.append(CSS_COMMENT_START);
            element.addContent(factory.text(CSS_COMMENT_END));
        }
    }
}
@@ -0,0 +1,185 @@
package org.htmlcleaner;
import java.util.concurrent.ConcurrentMap;
/**
 * Registers the MathML tags to use with Html5 tags.
 *
 * @author User
 */
public class MathMLTagProvider {

    private static final String CLOSE_BEFORE_TAGS = "menclose,mpadded,mphantom,mfenced,mstyle,merror,msqrt,mroot,maligngroup,malignmark,mlabeledtr,ms,mi,mo,mn,mfrac,mtext,mspace,mglyph,p,details,summary,menuitem,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";

    /**
     * Registers all presentation markup tags in {@code tagInfoMap}.
     *
     * @param tagInfo    passed through to {@link #presentationMarkup}; its value is not read
     * @param tagInfoMap map that receives the tag definitions
     */
    public MathMLTagProvider(TagInfo tagInfo, ConcurrentMap<String, TagInfo> tagInfoMap) {
        presentationMarkup(tagInfo, tagInfoMap);
    }

    /**
     * Registers the token, layout, script and table tags, followed by {@code maction}.
     *
     * @param tagInfo    not read; kept for signature compatibility
     * @param tagInfoMap map that receives the tag definitions
     */
    public void presentationMarkup(TagInfo tagInfo, ConcurrentMap<String, TagInfo> tagInfoMap) {
        tokenElements(tagInfo, tagInfoMap);
        layoutElements(tagInfo, tagInfoMap);
        scriptElements(tagInfo, tagInfoMap);
        tableElements(tagInfo, tagInfoMap);

        register("maction", CloseTag.required, Display.block, tagInfoMap);
    }

    /**
     * Registers the token tags: mi, mn, mo, mtext, mspace, ms and mglyph.
     *
     * @param tagInfo    not read; kept for signature compatibility
     * @param tagInfoMap map that receives the tag definitions
     */
    public void tokenElements(TagInfo tagInfo, ConcurrentMap<String, TagInfo> tagInfoMap) {
        register("mi", CloseTag.required, Display.inline, tagInfoMap);
        register("mn", CloseTag.required, Display.inline, tagInfoMap);
        register("mo", CloseTag.required, Display.inline, tagInfoMap);
        register("mtext", CloseTag.required, Display.block, tagInfoMap);
        register("mspace", CloseTag.optional, Display.block, tagInfoMap);
        register("ms", CloseTag.required, Display.block, tagInfoMap);
        register("mglyph", CloseTag.optional, Display.block, tagInfoMap);
    }

    /**
     * Registers the layout tags: mrow, mfrac, msqrt, mroot, mstyle, merror, mpadded,
     * mphantom, mfenced and menclose.
     *
     * @param tagInfo    not read; kept for signature compatibility
     * @param tagInfoMap map that receives the tag definitions
     */
    public void layoutElements(TagInfo tagInfo, ConcurrentMap<String, TagInfo> tagInfoMap) {
        register("mrow", CloseTag.required, Display.block, tagInfoMap);
        register("mfrac", CloseTag.required, Display.block, tagInfoMap);
        register("msqrt", CloseTag.required, Display.inline, tagInfoMap);
        register("mroot", CloseTag.required, Display.inline, tagInfoMap);
        register("mstyle", CloseTag.required, Display.block, tagInfoMap);
        register("merror", CloseTag.required, Display.block, tagInfoMap);
        register("mpadded", CloseTag.required, Display.block, tagInfoMap);
        register("mphantom", CloseTag.required, Display.block, tagInfoMap);
        register("mfenced", CloseTag.required, Display.block, tagInfoMap);
        register("menclose", CloseTag.required, Display.block, tagInfoMap);
    }

    /**
     * Registers the script and limit tags: msub, msup, msubsup, munder, mover,
     * munderover and mmultiscripts.
     *
     * @param tagInfo    not read; kept for signature compatibility
     * @param tagInfoMap map that receives the tag definitions
     */
    public void scriptElements(TagInfo tagInfo, ConcurrentMap<String, TagInfo> tagInfoMap) {
        register("msub", CloseTag.required, Display.inline, tagInfoMap);
        register("msup", CloseTag.required, Display.inline, tagInfoMap);
        register("msubsup", CloseTag.required, Display.block, tagInfoMap);
        register("munder", CloseTag.required, Display.block, tagInfoMap);
        register("mover", CloseTag.required, Display.block, tagInfoMap);
        register("munderover", CloseTag.required, Display.block, tagInfoMap);
        register("mmultiscripts", CloseTag.required, Display.block, tagInfoMap);
    }

    /**
     * Registers the table tags: mtable, mlabeledtr, mtr, mtd, maligngroup and malignmark.
     *
     * @param tagInfo    not read; kept for signature compatibility
     * @param tagInfoMap map that receives the tag definitions
     */
    public void tableElements(TagInfo tagInfo, ConcurrentMap<String, TagInfo> tagInfoMap) {
        TagInfo table = newMathTag("mtable", CloseTag.required, Display.block);
        table.defineAllowedChildrenTags("mtr,mtd,mo,mn,mlabeledtr");
        this.put("mtable", table, tagInfoMap);

        TagInfo labeledRow = newMathTag("mlabeledtr", CloseTag.required, Display.block);
        labeledRow.defineRequiredEnclosingTags("mtable");
        labeledRow.defineFatalTags("mtable");
        this.put("mlabeledtr", labeledRow, tagInfoMap);

        // No required enclosing tag is defined for mtr.
        TagInfo row = newMathTag("mtr", CloseTag.required, Display.block);
        row.defineAllowedChildrenTags("mtd,mlabeledtr");
        this.put("mtr", row, tagInfoMap);

        // Neither a required enclosing tag nor a fatal tag is defined for mtd.
        register("mtd", CloseTag.required, Display.block, tagInfoMap);

        register("maligngroup", CloseTag.required, Display.block, tagInfoMap);
        register("malignmark", CloseTag.required, Display.block, tagInfoMap);
    }

    /**
     * Creates a tag definition with the settings shared by every tag of this provider
     * and the common close-before list.
     */
    private TagInfo newMathTag(String tagName, CloseTag closeTag, Display display) {
        TagInfo created = new TagInfo(tagName, ContentType.all, BelongsTo.BODY, false, false, false, closeTag, display);
        created.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
        return created;
    }

    /**
     * Creates a tag definition that needs no extra settings and stores it through {@link #put}.
     */
    private void register(String tagName, CloseTag closeTag, Display display,
            ConcurrentMap<String, TagInfo> tagInfoMap) {
        this.put(tagName, newMathTag(tagName, closeTag, display), tagInfoMap);
    }

    /**
     * Stores a tag definition in the map under the given name.
     *
     * @param tagName    key to store under
     * @param tagInfo    definition to store
     * @param tagInfoMap map that receives the definition
     */
    protected void put(String tagName, TagInfo tagInfo, ConcurrentMap<String, TagInfo> tagInfoMap) {
        tagInfoMap.put(tagName, tagInfo);
    }

    /**
     * Looks up a tag definition.
     *
     * @param tagName    name to look up
     * @param tagInfoMap map to search
     * @return the stored definition, or {@code null} when {@code tagName} is {@code null}
     *         or nothing is stored under it
     */
    public TagInfo getTagInfo(String tagName, ConcurrentMap<String, TagInfo> tagInfoMap) {
        return tagName == null ? null : tagInfoMap.get(tagName);
    }
}
@@ -0,0 +1,26 @@
package org.htmlcleaner;
/**
 * Nesting State.
 * Wraps the current HtmlCleaner cleaning state by keeping the set of open tags
 * and the child breaks of that state together.
 *
 * @author scottw
 */
class NestingState {

    private final OpenTags openTags;
    private final ChildBreaks childBreaks;

    /**
     * @param openTags    open tags of the state
     * @param childBreaks child breaks of the state
     */
    public NestingState(OpenTags openTags, ChildBreaks childBreaks) {
        this.childBreaks = childBreaks;
        this.openTags = openTags;
    }

    /**
     * @return the open tags given at construction
     */
    public OpenTags getOpenTags() {
        return openTags;
    }

    /**
     * @return the child breaks given at construction
     */
    public ChildBreaks getChildBreaks() {
        return childBreaks;
    }
}
+133
View File
@@ -0,0 +1,133 @@
package org.htmlcleaner;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
/**
 * Keeps the list of tags that are open but not yet handled, together with
 * the methods for managing it.
 */
class OpenTags {

    /** Cleaner used for tag-info lookups and for reporting thread interruption. */
    private final HtmlCleaner htmlCleaner;

    /** Open tags in the order they were added. */
    List<TagPos> list = new ArrayList<TagPos>();

    /** Most recently added tag that is still in the list. */
    private TagPos last;

    /** Names of every tag ever added; entries are not cleared when a tag is removed. */
    private Set<String> encounteredNames = new HashSet<String>();

    /**
     * @param htmlCleaner cleaner this instance works for
     */
    OpenTags(HtmlCleaner htmlCleaner) {
        this.htmlCleaner = htmlCleaner;
    }

    /**
     * @return {@code true} when no tag is open
     */
    boolean isEmpty() {
        return list.isEmpty();
    }

    /**
     * Appends a newly opened tag and remembers its name as encountered.
     */
    void addTag(String tagName, TagInfo tagInfo, int position, CleanTimeValues cleanTimeValues) {
        TagPos opened = new TagPos(position, tagName, tagInfo, cleanTimeValues);
        last = opened;
        list.add(opened);
        encounteredNames.add(tagName);
    }

    /**
     * Removes the most recently opened tag with the given name, if there is one.
     * The search stops early when the current thread is interrupted.
     */
    void removeTag(String tagName) {
        for (ListIterator<TagPos> cursor = list.listIterator(list.size()); cursor.hasPrevious(); ) {
            if (Thread.currentThread().isInterrupted()) {
                this.htmlCleaner.handleInterruption();
                break;
            }
            if (tagName.equals(cursor.previous().name)) {
                cursor.remove();
                break;
            }
        }

        last = list.isEmpty() ? null : list.get(list.size() - 1);
    }

    /**
     * @return the earliest opened tag, or {@code null} when none is open
     */
    TagPos findFirstTagPos() {
        if (list.isEmpty()) {
            return null;
        }
        return list.get(0);
    }

    /**
     * @return the most recently opened tag still in the list, or {@code null}
     */
    TagPos getLastTagPos() {
        return last;
    }

    /**
     * Searches from the most recently opened tag backwards for a tag with the given name.
     *
     * @return the matching open tag; {@code null} when the name is {@code null}, no match
     *         exists, a fatal tag for this name is met first, or the thread is interrupted
     */
    TagPos findTag(String tagName, CleanTimeValues cleanTimeValues) {
        if (tagName == null) {
            return null;
        }

        ListIterator<TagPos> cursor = list.listIterator(list.size());
        TagInfo searchedInfo = this.htmlCleaner.getTagInfo(tagName, cleanTimeValues);
        while (cursor.hasPrevious()) {
            if (Thread.currentThread().isInterrupted()) {
                this.htmlCleaner.handleInterruption();
                return null;
            }
            TagPos candidate = cursor.previous();
            if (tagName.equals(candidate.name)) {
                return candidate;
            }
            // do not search past a fatal tag for this tag
            if (searchedInfo != null && searchedInfo.isFatalTag(candidate.name)) {
                return null;
            }
        }
        return null;
    }

    /**
     * @return {@code true} when {@link #findTag} finds an open tag with the given name
     */
    boolean tagExists(String tagName, CleanTimeValues cleanTimeValues) {
        return findTag(tagName, cleanTimeValues) != null;
    }

    /**
     * Walks the open tags from the most recent one backwards. When a tag without tag info,
     * or one that allows anything, is reached after at least one other tag, the tag visited
     * just before it is returned; otherwise the earliest opened tag is returned.
     *
     * @return the chosen tag, or {@code null} when no tag is open or the thread is interrupted
     */
    TagPos findTagToPlaceRubbish() {
        if (isEmpty()) {
            return null;
        }

        TagPos current = null;
        TagPos visitedBefore = null;
        for (ListIterator<TagPos> cursor = list.listIterator(list.size()); cursor.hasPrevious(); ) {
            if (Thread.currentThread().isInterrupted()) {
                this.htmlCleaner.handleInterruption();
                return null;
            }
            current = cursor.previous();
            boolean acceptsAnything = current.info == null || current.info.allowsAnything();
            if (acceptsAnything && visitedBefore != null) {
                return visitedBefore;
            }
            visitedBefore = current;
        }
        return current;
    }

    /**
     * @return {@code true} when a tag with this name has ever been added, even if it has
     *         been removed since
     */
    boolean tagEncountered(String tagName) {
        return encounteredNames.contains(tagName);
    }

    /**
     * Checks if any of the tags specified in the set is already open.
     *
     * @param tags names to look for
     * @return {@code true} when at least one open tag has a name contained in {@code tags}
     */
    boolean someAlreadyOpen(Set<String> tags) {
        for (TagPos open : list) {
            if (tags.contains(open.name)) {
                return true;
            }
        }
        return false;
    }
}
@@ -0,0 +1,20 @@
package org.htmlcleaner;
/**
 * Policy for emitting an optional piece of output: never, only when the source
 * supplied it, or always.
 */
public enum OptionalOutput {
/**
 * Never output, even if supplied in the source.
 */
omit,
/**
 * Output ONLY if supplied in the source.
 */
preserve,
/**
 * Always output; if the information is not supplied in the source, a default is created.
 */
alwaysOutput;
}
@@ -0,0 +1,221 @@
/* Copyright (c) 2006-2013, HtmlCleaner project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.*;
import java.util.*;
/**
* <p>Pretty HTML serializer - creates resulting HTML with indenting lines.</p>
*/
public class PrettyHtmlSerializer extends HtmlSerializer {
private static final String DEFAULT_INDENTATION_STRING = "\t";
private String indentString = DEFAULT_INDENTATION_STRING;
private List<String> indents = new ArrayList<String>();
/**
 * Creates a serializer that indents with the default indentation string (a tab).
 *
 * @param props cleaner properties passed on to the two-argument constructor
 */
public PrettyHtmlSerializer(CleanerProperties props) {
this(props, DEFAULT_INDENTATION_STRING);
}
/**
 * Creates a serializer that indents with the given string.
 *
 * @param props        cleaner properties passed to the superclass constructor
 * @param indentString string appended once per nesting level when indentation is built
 */
public PrettyHtmlSerializer(CleanerProperties props, String indentString) {
super(props);
this.indentString = indentString;
}
/**
 * Serializes {@code tagNode} by delegating to {@code serializePrettyHtml}, starting at
 * level 0 with whitespace preservation off and the "last output was a new line" flag set.
 *
 * @param tagNode node to serialize
 * @param writer  destination
 * @throws IOException if writing fails
 */
protected void serialize(TagNode tagNode, Writer writer) throws IOException {
serializePrettyHtml(tagNode, writer, 0, false, true);
}
/**
 * Returns the indentation for a nesting depth. Indentation strings are cached in
 * {@code indents}; missing entries up to {@code level} are created on demand, each one
 * being the previous entry plus {@code indentString} (depth 0 is the empty string).
 *
 * @param level nesting depth
 * @return Appropriate indentation for the specified depth.
 */
private synchronized String getIndent(int level) {
    // Grow the cache one entry at a time until it covers the requested depth.
    while (indents.size() <= level) {
        int cached = indents.size();
        String next = cached == 0 ? "" : indents.get(cached - 1) + indentString;
        indents.add(next);
    }
    return indents.get(level);
}
private String getIndentedText(String content, int level) {
String indent = getIndent(level);
StringBuilder result = new StringBuilder( content.length() );
StringTokenizer tokenizer = new StringTokenizer(content, "\n\r");
while (tokenizer.hasMoreTokens()) {
String line = tokenizer.nextToken().trim();
if (!"".equals(line)) {
result.append(indent).append(line).append("\n");
}
}
return result.toString();
}
private String getSingleLineOfChildren(List<? extends BaseToken> children) {
StringBuilder result = new StringBuilder();
Iterator<? extends BaseToken> childrenIt = children.iterator();
boolean isFirst = true;
while (childrenIt.hasNext()) {
Object child = childrenIt.next();
if ( !(child instanceof ContentNode) ) {
return null;
} else {
String content = child.toString();
//
// Removed the trim function as this has the potential
// to cause issues with actual content without adding
// any value
//
/*
// if first item trims it from left
if (isFirst) {
content = Utils.ltrim(content);
}
// if last item trims it from right
if (!childrenIt.hasNext()) {
content = Utils.rtrim(content);
}
*/
if ( content.indexOf("\n") >= 0 || content.indexOf("\r") >= 0 ) {
return null;
}
result.append(content);
}
isFirst = false;
}
return result.toString();
}
protected void serializePrettyHtml(TagNode tagNode, Writer writer, int level, boolean isPreserveWhitespaces, boolean isLastNewLine) throws IOException {
List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
String tagName = tagNode.getName();
boolean isHeadlessNode = Utils.isEmptyString(tagName);
String indent = isHeadlessNode ? "" : getIndent(level);
if (!isPreserveWhitespaces) {
if (!isLastNewLine) {
writer.write("\n");
}
writer.write(indent);
}
serializeOpenTag(tagNode, writer, true);
boolean preserveWhitespaces = isPreserveWhitespaces || "pre".equalsIgnoreCase(tagName);
boolean lastWasNewLine = false;
if ( !isMinimizedTagSyntax(tagNode) ) {
String singleLine = getSingleLineOfChildren(tagChildren);
boolean dontEscape = dontEscape(tagNode);
if (!preserveWhitespaces && singleLine != null) {
writer.write( !dontEscape(tagNode) ? escapeText(singleLine) : singleLine );
} else {
Iterator<? extends BaseToken> childIterator = tagChildren.iterator();
while (childIterator.hasNext()) {
Object child = childIterator.next();
if (child instanceof TagNode) {
serializePrettyHtml((TagNode)child, writer, isHeadlessNode ? level : level + 1, preserveWhitespaces, lastWasNewLine);
lastWasNewLine = false;
} else if (child instanceof ContentNode) {
String content = dontEscape ? child.toString() : escapeText(child.toString());
if (content.length() > 0) {
if (dontEscape || preserveWhitespaces) {
writer.write(content);
} else if (Character.isWhitespace(content.charAt(0))) {
if (!lastWasNewLine) {
writer.write("\n");
lastWasNewLine = false;
}
if (content.trim().length() > 0) {
writer.write( getIndentedText(Utils.rtrim(content), isHeadlessNode ? level : level + 1) );
} else {
lastWasNewLine = true;
}
} else {
if (content.trim().length() > 0) {
writer.write(Utils.rtrim(content));
}
if (!childIterator.hasNext()) {
writer.write("\n");
lastWasNewLine = true;
}
}
}
} else if (child instanceof CommentNode) {
if (!lastWasNewLine && !preserveWhitespaces) {
writer.write("\n");
lastWasNewLine = false;
}
CommentNode commentNode = (CommentNode) child;
String content = commentNode.getCommentedContent();
writer.write( dontEscape ? content : getIndentedText(content, isHeadlessNode ? level : level + 1) );
}
}
}
if (singleLine == null && !preserveWhitespaces) {
if (!lastWasNewLine) {
writer.write("\n");
}
writer.write(indent);
}
serializeEndTag(tagNode, writer, false);
}
}
}
@@ -0,0 +1,217 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.*;
/**
* <p>Pretty XML serializer - creates resulting XML with indenting lines.</p>
*/
public class PrettyXmlSerializer extends XmlSerializer {

    private static final String DEFAULT_INDENTATION_STRING = "\t";

    /** Text appended once per nesting level when building an indentation prefix. */
    private final String indentString;

    /** Cache of indentation prefixes; the element at index {@code n} is the prefix for depth {@code n}. */
    private final List<String> indents = new ArrayList<String>();

    /**
     * Creates a serializer that indents with a single tab per level.
     * @param props Cleaner properties passed to the parent serializer
     */
    public PrettyXmlSerializer(CleanerProperties props) {
        this(props, DEFAULT_INDENTATION_STRING);
    }

    /**
     * Creates a serializer that indents with the given string per level.
     * @param props Cleaner properties passed to the parent serializer
     * @param indentString Text repeated once per nesting level
     */
    public PrettyXmlSerializer(CleanerProperties props, String indentString) {
        super(props);
        this.indentString = indentString;
    }

    /**
     * Serializes the node starting at depth 0.
     */
    @Override
    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
        serializePrettyXml(tagNode, writer, 0);
    }

    /**
     * @param level Zero-based nesting depth
     * @return Appropriate indentation for the specified depth.
     */
    private synchronized String getIndent(int level) {
        int size = indents.size();
        if (size <= level) {
            // Extend the cache from the deepest known prefix up to the requested depth.
            String prevIndent = size == 0 ? null : indents.get(size - 1);
            for (int i = size; i <= level; i++) {
                String currIndent = prevIndent == null ? "" : prevIndent + indentString;
                indents.add(currIndent);
                prevIndent = currIndent;
            }
        }
        return indents.get(level);
    }

    /**
     * Splits the content on line breaks, trims every line, drops the empty ones and
     * prefixes each remaining line with the indentation for the given depth.
     * @param content Text to indent
     * @param level Nesting depth used to pick the indentation
     * @return Indented text; every emitted line ends with "\n"
     */
    private String getIndentedText(String content, int level) {
        String indent = getIndent(level);
        StringBuilder result = new StringBuilder(content.length());
        StringTokenizer tokenizer = new StringTokenizer(content, "\n\r");
        while (tokenizer.hasMoreTokens()) {
            String line = tokenizer.nextToken().trim();
            if (!"".equals(line)) {
                result.append(indent).append(line).append("\n");
            }
        }
        return result.toString();
    }

    /**
     * Concatenates the children into a single line of text when that is possible.
     * The first child is trimmed from the left and the last child from the right.
     * @param children Children of the tag being serialized
     * @return The concatenated content, or null if any child is not a
     *         {@link ContentNode} or still contains a line break after trimming
     */
    private String getSingleLineOfChildren(List<? extends BaseToken> children) {
        StringBuilder result = new StringBuilder();
        Iterator<? extends BaseToken> childrenIt = children.iterator();
        boolean isFirst = true;
        while (childrenIt.hasNext()) {
            Object child = childrenIt.next();
            if (!(child instanceof ContentNode)) {
                return null;
            }
            String content = child.toString();
            // if first item trims it from left
            if (isFirst) {
                content = ltrim(content);
            }
            // if last item trims it from right
            if (!childrenIt.hasNext()) {
                content = rtrim(content);
            }
            if (content.indexOf("\n") >= 0 || content.indexOf("\r") >= 0) {
                return null;
            }
            result.append(content);
            isFirst = false;
        }
        return result.toString();
    }

    /**
     * Writes the tag and, recursively, its children with indentation.
     * @param tagNode Node to serialize
     * @param writer Destination of the output
     * @param level Nesting depth of the node; children of a node with an empty name stay on the same depth
     * @throws IOException If the writer fails
     */
    protected void serializePrettyXml(TagNode tagNode, Writer writer, int level) throws IOException {
        List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
        boolean isHeadlessNode = Utils.isEmptyString(tagNode.getName());
        String indent = isHeadlessNode ? "" : getIndent(level);

        writer.write(indent);
        serializeOpenTag(tagNode, writer, true);

        if (!isMinimizedTagSyntax(tagNode)) {
            String singleLine = getSingleLineOfChildren(tagChildren);
            boolean dontEscape = dontEscape(tagNode);
            if (singleLine != null) {
                // Unescaped content still gets "]]>" rewritten; presumably it ends up
                // inside a CDATA section opened by the parent class - confirm against XmlSerializer.
                writer.write(dontEscape ? singleLine.replace("]]>", "]]&gt;") : escapeXml(singleLine));
            } else {
                if (!isHeadlessNode) {
                    writer.write("\n");
                }
                int childLevel = isHeadlessNode ? level : level + 1;
                for (Object child : tagChildren) {
                    if (child instanceof TagNode) {
                        serializePrettyXml((TagNode) child, writer, childLevel);
                    } else if (child instanceof CData) {
                        // Checked before ContentNode so that CDATA gets its dedicated handling.
                        serializeCData((CData) child, tagNode, writer);
                    } else if (child instanceof ContentNode) {
                        String content = dontEscape
                                ? child.toString().replace("]]>", "]]&gt;")
                                : escapeXml(child.toString());
                        writer.write(getIndentedText(content, childLevel));
                    } else if (child instanceof CommentNode) {
                        CommentNode commentNode = (CommentNode) child;
                        String content = commentNode.getCommentedContent();
                        writer.write(getIndentedText(content, childLevel));
                    }
                }
            }

            // Multi-line content puts the closing tag on its own indented line.
            if (singleLine == null) {
                writer.write(indent);
            }
            serializeEndTag(tagNode, writer, true);
        }
    }

    /**
     * Trims specified string from left.
     * @param s String to trim, may be null
     * @return The string without leading whitespace, or null if the argument is null
     */
    private static String ltrim(String s) {
        if (s == null) {
            return null;
        }
        int index = 0;
        int len = s.length();
        while (index < len && Character.isWhitespace(s.charAt(index))) {
            index++;
        }
        return (index >= len) ? "" : s.substring(index);
    }

    /**
     * Trims specified string from right.
     * @param s String to trim, may be null
     * @return The string without trailing whitespace, or null if the argument is null
     */
    private static String rtrim(String s) {
        if (s == null) {
            return null;
        }
        int index = s.length();
        while (index > 0 && Character.isWhitespace(s.charAt(index - 1))) {
            index--;
        }
        return (index <= 0) ? "" : s.substring(0, index);
    }
}
@@ -0,0 +1,49 @@
package org.htmlcleaner;
/**
* A {@link TagNode} that only really holds whitespace or comments - allows
* using {@link ContentNode} in places where a {@link TagNode} is expected.
* <p/>
* This class is currently just a short-lived intermediate artifact generated
* from {@link HtmlCleaner} while cleaning an html file and discarded
* before the results are returned.
*
* @author andyhot
*/
class ProxyTagNode extends TagNode {

    /** Wrapped content token; null when this proxy wraps a comment. */
    private final ContentNode token;

    /** Wrapped comment; null when this proxy wraps a content token. */
    private final CommentNode comment;

    /** Node the wrapped token is removed from by {@link #removeFromTree()}. */
    private final TagNode bodyNode;

    /**
     * Wraps a content token.
     * @param token Content token to wrap
     * @param bodyNode Node used when removing the token from the tree
     */
    public ProxyTagNode(ContentNode token, TagNode bodyNode) {
        super("");
        this.token = token;
        this.comment = null;
        this.bodyNode = bodyNode;
    }

    /**
     * Wraps a comment.
     * @param comment Comment to wrap
     * @param bodyNode Node used when removing the comment from the tree
     */
    public ProxyTagNode(CommentNode comment, TagNode bodyNode) {
        super("");
        this.token = null;
        this.comment = comment;
        this.bodyNode = bodyNode;
    }

    /**
     * @return Always null - a proxy reports no parent.
     */
    @Override
    public TagNode getParent() {
        return null;
    }

    /**
     * Removes the wrapped token from the body node.
     * @return Always true
     */
    @Override
    public boolean removeFromTree() {
        BaseToken wrapped = getToken();
        bodyNode.removeChild(wrapped);
        return true;
    }

    /**
     * @return The wrapped content token if there is one, otherwise the wrapped comment
     */
    public BaseToken getToken() {
        if (token != null) {
            return token;
        }
        return comment;
    }

    /**
     * @return Content of the wrapped content token if there is one, otherwise content of the wrapped comment
     */
    public String getContent() {
        if (token != null) {
            return token.getContent();
        }
        return comment.getContent();
    }
}
@@ -0,0 +1,273 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.*;
import java.util.*;
/**
* <p>Basic abstract serializer - contains common logic for descendants (methods <code>writeXXX()</code>).</p>
*/
public abstract class Serializer {

    /**
     * Used to implement serialization with missing envelope - omitting open and close tags, just
     * serialize children. The node has an empty name and copies attributes, children, doctype and
     * namespace declarations of the wrapped node.
     */
    private static class HeadlessTagNode extends TagNode {
        private HeadlessTagNode(TagNode wrappedNode) {
            super("");
            getAttributes().putAll(wrappedNode.getAttributes());
            addChildren(wrappedNode.getAllChildren());
            setDocType(wrappedNode.getDocType());
            Map<String, String> nsDecls = getNamespaceDeclarations();
            if (nsDecls != null) {
                Map<String, String> wrappedNSDecls = wrappedNode.getNamespaceDeclarations();
                if (wrappedNSDecls != null) {
                    nsDecls.putAll(wrappedNSDecls);
                }
            }
        }
    }

    /** Cleaner properties consulted for the charset and for the declarations to omit. */
    protected CleanerProperties props;

    /**
     * @param props Cleaner properties used by this serializer
     */
    protected Serializer(CleanerProperties props) {
        this.props = props;
    }

    /**
     * Writes specified TagNode to the output stream, using specified charset and optionally omits node envelope
     * (skips open and close tags of the node). The stream is closed once the node has been written.
     * @param tagNode Node to be written
     * @param out Output stream
     * @param charset Charset of the output
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToStream(TagNode tagNode, OutputStream out, String charset, boolean omitEnvelope) throws IOException {
        write(tagNode, new OutputStreamWriter(out, charset), charset, omitEnvelope);
    }

    /**
     * Writes specified TagNode to the output stream, using specified charset.
     * @param tagNode Node to be written
     * @param out Output stream
     * @param charset Charset of the output
     * @throws IOException
     */
    public void writeToStream(TagNode tagNode, OutputStream out, String charset) throws IOException {
        writeToStream(tagNode, out, charset, false);
    }

    /**
     * Writes specified TagNode to the output stream, using the charset configured in the cleaner properties
     * and optionally omits node envelope (skips open and close tags of the node).
     * @param tagNode Node to be written
     * @param out Output stream
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToStream(TagNode tagNode, OutputStream out, boolean omitEnvelope) throws IOException {
        writeToStream(tagNode, out, props.getCharset(), omitEnvelope);
    }

    /**
     * Writes specified TagNode to the output stream, using the charset configured in the cleaner properties.
     * @param tagNode Node to be written
     * @param out Output stream
     * @throws IOException
     */
    public void writeToStream(TagNode tagNode, OutputStream out) throws IOException {
        writeToStream(tagNode, out, false);
    }

    /**
     * Writes specified TagNode to the file, using specified charset and optionally omits node envelope
     * (skips open and close tags of the node).
     * @param tagNode Node to be written
     * @param fileName Output file name
     * @param charset Charset of the output
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToFile(TagNode tagNode, String fileName, String charset, boolean omitEnvelope) throws IOException {
        OutputStream out = new FileOutputStream(fileName);
        try {
            writeToStream(tagNode, out, charset, omitEnvelope);
        } finally {
            // The stream is opened here, so it is released here even when the charset is
            // unsupported or serialization fails; closing an already closed stream is harmless.
            out.close();
        }
    }

    /**
     * Writes specified TagNode to the file, using specified charset.
     * @param tagNode Node to be written
     * @param fileName Output file name
     * @param charset Charset of the output
     * @throws IOException
     */
    public void writeToFile(TagNode tagNode, String fileName, String charset) throws IOException {
        writeToFile(tagNode, fileName, charset, false);
    }

    /**
     * Writes specified TagNode to the file, using the charset configured in the cleaner properties
     * and optionally omits node envelope (skips open and close tags of the node).
     * @param tagNode Node to be written
     * @param fileName Output file name
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void writeToFile(TagNode tagNode, String fileName, boolean omitEnvelope) throws IOException {
        writeToFile(tagNode, fileName, props.getCharset(), omitEnvelope);
    }

    /**
     * Writes specified TagNode to the file, using the charset configured in the cleaner properties.
     * @param tagNode Node to be written
     * @param fileName Output file name
     * @throws IOException
     */
    public void writeToFile(TagNode tagNode, String fileName) throws IOException {
        writeToFile(tagNode, fileName, false);
    }

    /**
     * @param tagNode Node to serialize to string
     * @param charset Charset of the output - stands in xml declaration part
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @return Output as string
     * @throws HtmlCleanerException If writing to the in-memory buffer fails
     */
    public String getAsString(TagNode tagNode, String charset, boolean omitEnvelope) {
        StringWriter writer = new StringWriter();
        try {
            write(tagNode, writer, charset, omitEnvelope);
        } catch (IOException e) {
            // not writing to the file system so any io errors should be really rare ( and bad)
            throw new HtmlCleanerException(e);
        }
        return writer.getBuffer().toString();
    }

    /**
     * @param tagNode Node to serialize to string
     * @param charset Charset of the output - stands in xml declaration part
     * @return Output as string
     */
    public String getAsString(TagNode tagNode, String charset) {
        return getAsString(tagNode, charset, false);
    }

    /**
     * Serializes using the charset configured in the cleaner properties.
     * @param tagNode Node to serialize to string
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @return Output as string
     */
    public String getAsString(TagNode tagNode, boolean omitEnvelope) {
        return getAsString(tagNode, props.getCharset(), omitEnvelope);
    }

    /**
     * Serializes using the charset configured in the cleaner properties.
     * @param tagNode Node to serialize to string
     * @return Output as string
     */
    public String getAsString(TagNode tagNode) {
        return getAsString(tagNode, false);
    }

    /**
     * Cleans the given markup with a new {@link HtmlCleaner} built from this serializer's
     * properties and serializes the result using the charset configured in those properties.
     * @param htmlContent Markup to clean and serialize
     * @return Output as string
     */
    public String getAsString(String htmlContent) {
        HtmlCleaner htmlCleaner = new HtmlCleaner(this.props);
        TagNode tagNode = htmlCleaner.clean(htmlContent);
        return getAsString(tagNode, props.getCharset());
    }

    /**
     * Writes specified node using specified writer.
     * @param tagNode Node to serialize.
     * @param writer Writer instance
     * @param charset Charset of the output
     * @throws IOException
     */
    public void write(TagNode tagNode, Writer writer, String charset) throws IOException {
        write(tagNode, writer, charset, false);
    }

    /**
     * Writes specified node using specified writer. Unless disabled in the cleaner properties,
     * an XML declaration and the node's doctype are written first. The writer is flushed and
     * closed after the node has been written.
     * @param tagNode Node to serialize.
     * @param writer Writer instance
     * @param charset Charset of the output; when null the XML declaration carries no encoding
     * @param omitEnvelope Tells whether to skip open and close tag of the node.
     * @throws IOException
     */
    public void write(TagNode tagNode, Writer writer, String charset, boolean omitEnvelope) throws IOException {
        if (omitEnvelope) {
            tagNode = new HeadlessTagNode(tagNode);
        }
        writer = new BufferedWriter(writer);

        if (!props.isOmitXmlDeclaration()) {
            String declaration = "<?xml version=\"1.0\"";
            if (charset != null) {
                declaration += " encoding=\"" + charset + "\"";
            }
            declaration += "?>";
            writer.write(declaration + "\n");
        }

        if (!props.isOmitDoctypeDeclaration()) {
            DoctypeToken doctypeToken = tagNode.getDocType();
            if (doctypeToken != null) {
                doctypeToken.serialize(this, writer);
            }
        }

        serialize(tagNode, writer);
        writer.flush();
        writer.close();
    }

    /**
     * @param tagNode Node to check
     * @return True if the node's name is "script" or "style", ignoring case
     */
    protected boolean isScriptOrStyle(TagNode tagNode) {
        String tagName = tagNode.getName();
        return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
    }

    /**
     * Writes the node itself; implemented by concrete serializers.
     * @param tagNode Node to serialize
     * @param writer Destination of the output
     * @throws IOException
     */
    protected abstract void serialize(TagNode tagNode, Writer writer) throws IOException;
}
@@ -0,0 +1,75 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.*;
/**
* <p>Simple HTML serializer - creates resulting HTML without indenting and/or compacting.</p>
*/
public class SimpleHtmlSerializer extends HtmlSerializer {

    /** When false, text content is written as is instead of being escaped. */
    boolean escape = true;

    /**
     * @param props Cleaner properties passed to the parent serializer
     * @param escape Whether text content should be escaped
     */
    public SimpleHtmlSerializer(CleanerProperties props, boolean escape) {
        super(props);
        this.escape = escape;
    }

    /**
     * Creates a serializer that escapes text content.
     * @param props Cleaner properties passed to the parent serializer
     */
    public SimpleHtmlSerializer(CleanerProperties props) {
        super(props);
    }

    /**
     * Writes the open tag, then every child in order, then the end tag. Nothing but the
     * open tag is written for tags using the minimized syntax.
     */
    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
        serializeOpenTag(tagNode, writer, false);
        if (isMinimizedTagSyntax(tagNode)) {
            return;
        }

        for (Object child : tagNode.getAllChildren()) {
            if (child instanceof ContentNode) {
                String text = child.toString();
                boolean writeRaw = dontEscape(tagNode) || !escape;
                if (writeRaw) {
                    writer.write(text);
                } else {
                    writer.write(escapeText(text));
                }
            } else if (child instanceof BaseToken) {
                // Every other token knows how to serialize itself.
                BaseToken token = (BaseToken) child;
                token.serialize(this, writer);
            }
        }

        serializeEndTag(tagNode, writer, false);
    }
}
@@ -0,0 +1,79 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.Iterator;
import java.util.List;
/**
* <p>Simple XML serializer - creates resulting XML without indenting lines.</p>
*/
public class SimpleXmlSerializer extends XmlSerializer {

    /**
     * @param props Cleaner properties passed to the parent serializer
     */
    public SimpleXmlSerializer(CleanerProperties props) {
        super(props);
    }

    /**
     * Writes the open tag, then every non-null child in order, then the end tag. Nothing
     * but the open tag is written for tags using the minimized syntax.
     */
    @Override
    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
        serializeOpenTag(tagNode, writer, false);
        List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
        if (isMinimizedTagSyntax(tagNode)) {
            return;
        }

        for (Object child : tagChildren) {
            if (child == null) {
                continue;
            }
            if (child instanceof CData) {
                // Checked before ContentNode so that CDATA gets its dedicated handling.
                serializeCData((CData) child, tagNode, writer);
            } else if (child instanceof ContentNode) {
                serializeContentToken((ContentNode) child, tagNode, writer);
            } else {
                BaseToken token = (BaseToken) child;
                token.serialize(this, writer);
            }
        }

        serializeEndTag(tagNode, writer, false);
    }
}
@@ -0,0 +1,495 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.HashMap;
import java.util.Map;
/**
* <p>This class contains map with special entities used in HTML and their
* unicodes.</p>
*
* Created by: Vladimir Nikic<br/>
* Date: November, 2006.
*/
public class SpecialEntities {
public static final SpecialEntities INSTANCE = new SpecialEntities(true, true) {
@Override
public void put(SpecialEntity specialEntity) {
throw new UnsupportedOperationException("cannot add to this instance");
}
};
/**
* key is the {@link SpecialEntity#getKey()} ( i.e. "quot" )
*/
private Map<String, SpecialEntity> entities = new HashMap<String, SpecialEntity>();
/**
* Key is the Integer returned by {@link SpecialEntity#intValue()}
*/
private Map<Integer, SpecialEntity> entitiesByUnicodeCharcode = new HashMap<Integer, SpecialEntity>();
private boolean greek;
private boolean math;
private int maxEntityLength;
public static final char NON_BREAKABLE_SPACE = 160;
public SpecialEntities(boolean greek, boolean math) {
this.greek = greek;
this.math = math;
_put(new SpecialEntity("null", 0, "", true));
_put(new SpecialEntity("nbsp", NON_BREAKABLE_SPACE, null, true));
_put(new SpecialEntity("iexcl", 161, null, true));
_put(new SpecialEntity("cent", 162, null, true));
_put(new SpecialEntity("pound", 163, null, true));
_put(new SpecialEntity("curren", 164, null, true));
_put(new SpecialEntity("yen", 165, null, true));
_put(new SpecialEntity("brvbar", 166, null, true));
_put(new SpecialEntity("sect", 167, null, true));
_put(new SpecialEntity("uml", 168, null, true));
_put(new SpecialEntity("copy", 169, null, true));
_put(new SpecialEntity("ordf", 170, null, true));
_put(new SpecialEntity("laquo", 171, null, true));
_put(new SpecialEntity("not", 172, null, true));
_put(new SpecialEntity("shy", 173, null, true));
_put(new SpecialEntity("reg", 174, null, true));
_put(new SpecialEntity("macr", 175, null, true));
_put(new SpecialEntity("deg", 176, null, true));
_put(new SpecialEntity("plusmn", 177, null, true));
_put(new SpecialEntity("sup2", 178, null, true));
_put(new SpecialEntity("sup3", 179, null, true));
_put(new SpecialEntity("acute", 180, null, true));
_put(new SpecialEntity("micro", 181, null, true));
_put(new SpecialEntity("para", 182, null, true));
_put(new SpecialEntity("middot", 183, null, true));
_put(new SpecialEntity("cedil", 184, null, true));
_put(new SpecialEntity("sup1", 185, null, true));
_put(new SpecialEntity("ordm", 186, null, true));
_put(new SpecialEntity("raquo", 187, null, true));
_put(new SpecialEntity("frac14", 188, null, true));
_put(new SpecialEntity("frac12", 189, null, true));
_put(new SpecialEntity("frac34", 190, null, true));
_put(new SpecialEntity("iquest", 191, null, true));
_put(new SpecialEntity("Agrave", 192, null, true));
_put(new SpecialEntity("Aacute", 193, null, true));
_put(new SpecialEntity("Acirc", 194, null, true));
_put(new SpecialEntity("Atilde", 195, null, true));
_put(new SpecialEntity("Auml", 196, null, true));
_put(new SpecialEntity("Aring", 197, null, true));
_put(new SpecialEntity("AElig", 198, null, true));
_put(new SpecialEntity("Ccedil", 199, null, true));
_put(new SpecialEntity("Egrave", 200, null, true));
_put(new SpecialEntity("Eacute", 201, null, true));
_put(new SpecialEntity("Ecirc", 202, null, true));
_put(new SpecialEntity("Euml", 203, null, true));
_put(new SpecialEntity("Igrave", 204, null, true));
_put(new SpecialEntity("Iacute", 205, null, true));
_put(new SpecialEntity("Icirc", 206, null, true));
_put(new SpecialEntity("Iuml", 207, null, true));
_put(new SpecialEntity("ETH", 208, null, true));
_put(new SpecialEntity("Ntilde", 209, null, true));
_put(new SpecialEntity("Ograve", 210, null, true));
_put(new SpecialEntity("Oacute", 211, null, true));
_put(new SpecialEntity("Ocirc", 212, null, true));
_put(new SpecialEntity("Otilde", 213, null, true));
_put(new SpecialEntity("Ouml", 214, null, true));
_put(new SpecialEntity("times", 215, null, true));
_put(new SpecialEntity("Oslash", 216, null, true));
_put(new SpecialEntity("Ugrave", 217, null, true));
_put(new SpecialEntity("Uacute", 218, null, true));
_put(new SpecialEntity("Ucirc", 219, null, true));
_put(new SpecialEntity("Uuml", 220, null, true));
_put(new SpecialEntity("Yacute", 221, null, true));
_put(new SpecialEntity("THORN", 222, null, true));
_put(new SpecialEntity("szlig", 223, null, true));
_put(new SpecialEntity("agrave", 224, null, true));
_put(new SpecialEntity("aacute", 225, null, true));
_put(new SpecialEntity("acirc", 226, null, true));
_put(new SpecialEntity("atilde", 227, null, true));
_put(new SpecialEntity("auml", 228, null, true));
_put(new SpecialEntity("aring", 229, null, true));
_put(new SpecialEntity("aelig", 230, null, true));
_put(new SpecialEntity("ccedil", 231, null, true));
_put(new SpecialEntity("egrave", 232, null, true));
_put(new SpecialEntity("eacute", 233, null, true));
_put(new SpecialEntity("ecirc", 234, null, true));
_put(new SpecialEntity("euml", 235, null, true));
_put(new SpecialEntity("igrave", 236, null, true));
_put(new SpecialEntity("iacute", 237, null, true));
_put(new SpecialEntity("icirc", 238, null, true));
_put(new SpecialEntity("iuml", 239, null, true));
_put(new SpecialEntity("eth", 240, null, true));
_put(new SpecialEntity("ntilde", 241, null, true));
_put(new SpecialEntity("ograve", 242, null, true));
_put(new SpecialEntity("oacute", 243, null, true));
_put(new SpecialEntity("ocirc", 244, null, true));
_put(new SpecialEntity("otilde", 245, null, true));
_put(new SpecialEntity("ouml", 246, null, true));
_put(new SpecialEntity("divide", 247, null, true));
_put(new SpecialEntity("oslash", 248, null, true));
_put(new SpecialEntity("ugrave", 249, null, true));
_put(new SpecialEntity("uacute", 250, null, true));
_put(new SpecialEntity("ucirc", 251, null, true));
_put(new SpecialEntity("uuml", 252, null, true));
_put(new SpecialEntity("yacute", 253, null, true));
_put(new SpecialEntity("thorn", 254, null, true));
_put(new SpecialEntity("yuml", 255, null, true));
_put(new SpecialEntity("OElig", 338, null, true));
_put(new SpecialEntity("oelig", 339, null, true));
_put(new SpecialEntity("Scaron", 352, null, true));
_put(new SpecialEntity("scaron", 353, null, true));
_put(new SpecialEntity("Yuml", 376, null, true));
_put(new SpecialEntity("fnof", 402, null, true));
_put(new SpecialEntity("circ", 710, null, true));
_put(new SpecialEntity("tilde", 732, null, true));
if ( this.greek ) {
// 913 Alpha Α greek capital letter alpha
_put(new SpecialEntity("Alpha", 913, null, true));
// 914 Beta Β greek capital letter beta
_put(new SpecialEntity("Beta", 914, null, true));
// 915 Gamma Γ greek capital letter gamma
_put(new SpecialEntity("Gamma", 915, null, true));
// 916 Delta Δ greek capital letter delta
_put(new SpecialEntity("Delta", 916, null, true));
// 917 Epsilon Ε greek capital letter epsilon
_put(new SpecialEntity("Epsilon", 917, null, true));
// 918 Zeta Ζ greek capital letter zeta
_put(new SpecialEntity("Zeta", 918, null, true));
// 919 Eta Η greek capital letter eta
_put(new SpecialEntity("Eta", 919, null, true));
// 920 Theta Θ greek capital letter theta
_put(new SpecialEntity("Theta", 920, null, true));
// 921 Iota Ι greek capital letter iota
_put(new SpecialEntity("Iota", 921, null, true));
// 922 Kappa Κ greek capital letter kappa
_put(new SpecialEntity("Kappa", 922, null, true));
// 923 Lambda Λ greek capital letter lambda
_put(new SpecialEntity("Lambda", 923, null, true));
// 924 Mu Μ greek capital letter mu
_put(new SpecialEntity("Mu", 924, null, true));
// 925 Nu Ν greek capital letter nu
_put(new SpecialEntity("Nu", 925, null, true));
// 926 Xi Ξ greek capital letter xi
_put(new SpecialEntity("Xi", 926, null, true));
// 927 Omicron Ο greek capital letter omicron
_put(new SpecialEntity("Omicron", 927, null, true));
// 928 Pi Π greek capital letter pi
_put(new SpecialEntity("Pi", 928, null, true));
// 929 Rho Ρ greek capital letter rho
_put(new SpecialEntity("Rho", 929, null, true));
// there is no Sigmaf, and no U+03A2 character either
// 931 Sigma Σ greek capital letter sigma
_put(new SpecialEntity("Sigma", 931, null, true));
// 932 Tau Τ greek capital letter tau
_put(new SpecialEntity("Tau", 932, null, true));
// 933 Upsilon Υ greek capital letter upsilon
_put(new SpecialEntity("Upsilon", 933, null, true));
// 934 Phi Φ greek capital letter phi
_put(new SpecialEntity("Phi", 934, null, true));
// 935 Chi Χ greek capital letter chi
_put(new SpecialEntity("Chi", 935, null, true));
// 936 Psi Ψ greek capital letter psi
_put(new SpecialEntity("Psi", 936, null, true));
// 937 Omega Ω greek capital letter omega
_put(new SpecialEntity("Omega", 937, null, true));
// 945 alpha α greek small letter alpha
_put(new SpecialEntity("alpha", 945, null, true));
// 946 beta β greek small letter beta
_put(new SpecialEntity("beta", 946, null, true));
// 947 gamma γ greek small letter gamma
_put(new SpecialEntity("gamma", 947, null, true));
// 948 delta δ greek small letter delta
_put(new SpecialEntity("delta", 948, null, true));
// 949 epsilon ε greek small letter epsilon
_put(new SpecialEntity("epsilon", 949, null, true));
// 950 zeta ζ greek small letter zeta
_put(new SpecialEntity("zeta", 950, null, true));
// 951 eta η greek small letter eta
_put(new SpecialEntity("eta", 951, null, true));
// 952 theta θ greek small letter theta
_put(new SpecialEntity("theta", 952, null, true));
// 953 iota ι greek small letter iota
_put(new SpecialEntity("iota", 953, null, true));
// 954 kappa κ greek small letter kappa
_put(new SpecialEntity("kappa", 954, null, true));
// 955 lambda λ greek small letter lambda
_put(new SpecialEntity("lambda", 955, null, true));
// 956 mu μ greek small letter mu
_put(new SpecialEntity("mu", 956, null, true));
// 957 nu ν greek small letter nu
_put(new SpecialEntity("nu", 957, null, true));
// 958 xi ξ greek small letter xi
_put(new SpecialEntity("xi", 958, null, true));
// 959 omicron ο greek small letter omicron
_put(new SpecialEntity("omicron", 959, null, true));
// 960 pi π greek small letter pi
_put(new SpecialEntity("pi", 960, null, true));
// 961 rho ρ greek small letter rho
_put(new SpecialEntity("rho", 961, null, true));
// 962 sigmaf ς greek small letter final sigma
_put(new SpecialEntity("sigmaf", 962, null, true));
// 963 sigma σ greek small letter sigma
_put(new SpecialEntity("sigma", 963, null, true));
// 964 tau τ greek small letter tau
_put(new SpecialEntity("tau", 964, null, true));
// 965 upsilon υ greek small letter upsilon
_put(new SpecialEntity("upsilon", 965, null, true));
// 966 phi φ greek small letter phi
_put(new SpecialEntity("phi", 966, null, true));
// 967 chi χ greek small letter chi
_put(new SpecialEntity("chi", 967, null, true));
// 968 psi ψ greek small letter psi
_put(new SpecialEntity("psi", 968, null, true));
// 969 omega ω greek small letter omega
_put(new SpecialEntity("omega", 969, null, true));
// 977 thetasym ϑ greek small letter theta symbol
_put(new SpecialEntity("thetasym", 977, null, true));
// 978 upsih ϒ greek upsilon with hook symbol
_put(new SpecialEntity("upsih", 978, null, true));
// 982 piv ϖ greek pi symbol
_put(new SpecialEntity("piv", 982, null, true));
}
_put(new SpecialEntity("ensp", 8194, null, true));
_put(new SpecialEntity("emsp", 8195, null, true));
_put(new SpecialEntity("thinsp", 8201, null, true));
_put(new SpecialEntity("zwnj", 8204, null, true));
_put(new SpecialEntity("zwj", 8205, null, true));
_put(new SpecialEntity("lrm", 8206, null, true));
_put(new SpecialEntity("rlm", 8207, null, true));
_put(new SpecialEntity("ndash", 8211, null, true));
_put(new SpecialEntity("mdash", 8212, null, true));
_put(new SpecialEntity("lsquo", 8216, null, true));
_put(new SpecialEntity("rsquo", 8217, null, true));
_put(new SpecialEntity("sbquo", 8218, null, true));
_put(new SpecialEntity("ldquo", 8220, null, true));
_put(new SpecialEntity("rdquo", 8221, null, true));
_put(new SpecialEntity("bdquo", 8222, null, true));
_put(new SpecialEntity("dagger", 8224, null, true));
_put(new SpecialEntity("Dagger", 8225, null, true));
_put(new SpecialEntity("bull", 8226, null, true));
// three ellipses
_put(new SpecialEntity("hellip", 8230, null, true));
_put(new SpecialEntity("permil", 8240, null, true));
_put(new SpecialEntity("prime", 8242, null, true));
_put(new SpecialEntity("Prime", 8243, null, true));
_put(new SpecialEntity("lsaquo", 8249, null, true));
_put(new SpecialEntity("rsaquo", 8250, null, true));
_put(new SpecialEntity("oline", 8254, null, true));
_put(new SpecialEntity("frasl", 8260, null, true));
_put(new SpecialEntity("euro", 8364, null, true));
_put(new SpecialEntity("image", 8465, null, true));
_put(new SpecialEntity("weierp", 8472, null, true));
_put(new SpecialEntity("real", 8476, null, true));
_put(new SpecialEntity("trade", 8482, null, true));
_put(new SpecialEntity("alefsym", 8501, null, true));
_put(new SpecialEntity("larr", 8592, null, true));
_put(new SpecialEntity("uarr", 8593, null, true));
_put(new SpecialEntity("rarr", 8594, null, true));
_put(new SpecialEntity("darr", 8595, null, true));
_put(new SpecialEntity("harr", 8596, null, true));
_put(new SpecialEntity("crarr", 8629, null, true));
_put(new SpecialEntity("lArr", 8656, null, true));
_put(new SpecialEntity("uArr", 8657, null, true));
_put(new SpecialEntity("rArr", 8658, null, true));
_put(new SpecialEntity("dArr", 8659, null, true));
_put(new SpecialEntity("hArr", 8660, null, true));
if (this.math) {
// 8704 forall ∀ for all
_put(new SpecialEntity("forall", 8704, null, true));
//8706 part ∂ partial differential
_put(new SpecialEntity("part", 8706, null, true));
//8707 exist ∃ there exists
_put(new SpecialEntity("exist", 8707, null, true));
//8709 empty ∅ empty set = null set = diameter
_put(new SpecialEntity("empty", 8709, null, true));
//8711 nabla ∇ nabla = backward difference
_put(new SpecialEntity("nabla", 8711, null, true));
//8712 isin ∈ element of
_put(new SpecialEntity("isin", 8712, null, true));
//8713 notin ∉ not an element of
_put(new SpecialEntity("notin", 8713, null, true));
//8715 ni ∋ contains as member
_put(new SpecialEntity("ni", 8715, null, true));
//8719 prod ∏ n-ary product = product sign
//prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both
_put(new SpecialEntity("prod", 8719, null, true));
//8721 sum ∑ n-ary sumation
//sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both
_put(new SpecialEntity("sum", 8721, null, true));
//8722 minus minus sign
_put(new SpecialEntity("minus", 8722, null, true));
//8727 lowast asterisk operator
_put(new SpecialEntity("lowast", 8727, null, true));
//8730 radic √ square root = radical sign
_put(new SpecialEntity("radic", 8730, null, true));
//8733 prop ∝ proportional to
_put(new SpecialEntity("prop", 8733, null, true));
//8734 infin ∞ infinity
_put(new SpecialEntity("infin", 8734, null, true));
//8736 ang ∠ angle
_put(new SpecialEntity("ang", 8736, null, true));
//8743 and ∧ logical and = wedge
_put(new SpecialEntity("and", 8743, null, true));
//8744 or logical or = vee
_put(new SpecialEntity("or", 8744, null, true));
//8745 cap ∩ intersection = cap
_put(new SpecialEntity("cap", 8745, null, true));
//8746 cup union = cup
_put(new SpecialEntity("cup", 8746, null, true));
//8747 int ∫ integral
_put(new SpecialEntity("int", 8747, null, true));
//8756 there4 ∴ therefore
_put(new SpecialEntity("there4", 8756, null, true));
//8764 sim tilde operator = varies with = similar to
//tilde operator is NOT the same character as the tilde, U+007E, although the same glyph might be used to represent both
_put(new SpecialEntity("sim", 8764, null, true));
//8773 cong ≅ approximately equal to
_put(new SpecialEntity("cong", 8773, null, true));
//8776 asymp ≈ almost equal to = asymptotic to
_put(new SpecialEntity("asymp", 8776, null, true));
//8800 ne ≠ not equal to
_put(new SpecialEntity("ne", 8800, null, true));
//8801 equiv ≡ identical to
_put(new SpecialEntity("equiv", 8801, null, true));
//8804 le ≤ less-than or equal to
_put(new SpecialEntity("le", 8804, null, true));
//8805 ge ≥ greater-than or equal to
_put(new SpecialEntity("ge", 8805, null, true));
//8834 sub ⊂ subset of
_put(new SpecialEntity("sub", 8834, null, true));
//8835 sup ⊃ superset of
_put(new SpecialEntity("sup", 8835, null, true));
//note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included. Should it be, for symmetry? It is in ISOamsn
//8836 nsub ⊄ not a subset of
_put(new SpecialEntity("nsub", 8836, null, true));
//8838 sube ⊆ subset of or equal to
_put(new SpecialEntity("sube", 8838, null, true));
//8839 supe ⊇ superset of or equal to
_put(new SpecialEntity("supe", 8839, null, true));
//8853 oplus ⊕ circled plus = direct sum
_put(new SpecialEntity("oplus", 8853, null, true));
//8855 otimes ⊗ circled times = vector product
_put(new SpecialEntity("otimes", 8855, null, true));
//8869 perp ⊥ up tack = orthogonal to = perpendicular
_put(new SpecialEntity("perp", 8869, null, true));
//8901 sdot ⋅ dot operator
_put(new SpecialEntity("sdot", 8901, null, true));
//dot operator is NOT the same character as U+00B7 middle dot
//8968 lceil ⌈ left ceiling = apl upstile
_put(new SpecialEntity("lceil", 8968, null, true));
//8969 rceil ⌉ right ceiling
_put(new SpecialEntity("rceil", 8969, null, true));
//8970 lfloor ⌊ left floor = apl downstile
_put(new SpecialEntity("lfloor", 8970, null, true));
//8971 rfloor ⌋ right floor
_put(new SpecialEntity("rfloor", 8971, null, true));
//9001 lang 〈 left-pointing angle bracket = bra
//lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark'
_put(new SpecialEntity("lang", 9001, null, true));
//9002 rang 〉 right-pointing angle bracket = ket
//rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark'
_put(new SpecialEntity("rang", 9002, null, true));
//9674 loz ◊ lozenge
_put(new SpecialEntity("loz", 9674, null, true));
//black here seems to mean filled as opposed to hollow
//9824 spades ♠ black spade suit
_put(new SpecialEntity("spades", 9824, null, true));
//9827 clubs ♣ black club suit = shamrock
_put(new SpecialEntity("clubs", 9827, null, true));
//9829 hearts ♥ black heart suit = valentine
_put(new SpecialEntity("hearts", 9829, null, true));
//9830 diams ♦ black diamond suit
_put(new SpecialEntity("diams", 9830, null, true));
}
_put(new SpecialEntity("amp", '&', null, false));
_put(new SpecialEntity("lt", '<', null, false));
_put(new SpecialEntity("gt", '>', null, false));
_put(new SpecialEntity("quot", '"', null, false));
// this is xml only -- apos appearing in html needs to be converted to ' or maybe &#39; to be universally safe
// may need to special case for html attributes that use ' as surrounding delimeter on attribute value (instead of " ) : <a href='javascript:foo("bar'")' >wierd link</a>
_put(new SpecialEntity("apos", '\'', "'", false));
}
/**
 * Looks up an entity by name in the {@code entities} table.
 *
 * @param seq entity text; a leading {@code &} is skipped and everything from the first
 *            {@code ;} onward is dropped prior to comparison
 * @return {@code null} when {@code seq} is empty, otherwise whatever the
 *         {@code entities} table holds for the extracted name
 */
public SpecialEntity getSpecialEntity(String seq) {
    if (seq.length() == 0) {
        return null;
    }
    // Skip the optional '&' prefix.
    final int from = seq.charAt(0) == '&' ? 1 : 0;
    // Stop at the first ';' if there is one, otherwise take the rest of the string.
    final int terminator = seq.indexOf(';');
    final int to = terminator < 0 ? seq.length() : terminator;
    return entities.get(seq.substring(from, to));
}
/**
 * Looks up an entity by its numeric character code.
 *
 * @param unicodeCharcode the character code used as the lookup key
 * @return the entry held in {@code entitiesByUnicodeCharcode} for that code
 */
public SpecialEntity getSpecialEntityByUnicode(int unicodeCharcode) {
    final SpecialEntity match = this.entitiesByUnicodeCharcode.get(unicodeCharcode);
    return match;
}
/**
 * Registers an additional entity; public entry point that delegates to {@code _put}.
 *
 * @param specialEntity the entity to register
 */
public void put(SpecialEntity specialEntity) {
    this._put(specialEntity);
}
/**
 * Stores the entity under both its name and its numeric code, and keeps
 * {@code maxEntityLength} at the length of the longest name seen so far.
 *
 * NOTE: each duplicate check runs after the corresponding {@code put}, so when the
 * exception is thrown the earlier entry has already been overwritten.
 *
 * @param specialEntity the entity to store
 * @throws HtmlCleanerException if an entry already existed for the same name or the same code
 */
private void _put(SpecialEntity specialEntity) {
    final String entityName = specialEntity.getKey();

    SpecialEntity previous = entities.put(entityName, specialEntity);
    if (previous != null) {
        throw new HtmlCleanerException("replaced " + previous + " with " + specialEntity);
    }

    previous = entitiesByUnicodeCharcode.put(specialEntity.intValue(), specialEntity);
    if (previous != null) {
        throw new HtmlCleanerException("replaced " + previous + " with " + specialEntity);
    }

    final int nameLength = entityName.length();
    if (nameLength > this.maxEntityLength) {
        this.maxEntityLength = nameLength;
    }
}
/**
 * @return the current value of {@code maxEntityLength}, which {@code _put} raises to the
 *         length of the longest entity name it has stored
 */
public int getMaxEntityLength() {
    return this.maxEntityLength;
}
}
@@ -0,0 +1,135 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
 * One named character entity (for example {@code amp} for {@code &amp;}) together with its
 * numeric code and the strings used when it is written out as HTML or as XML.
 * Instances are immutable.
 */
public class SpecialEntity{
    private final String key;
    private final int intCode;
    // value output when generating html
    private final String htmlString;
    private final boolean htmlSpecialEntity;
    // value output when generating xml
    private final String escapedXmlString;

    /**
     * @param key value between & and the ';' example 'amp' for '&amp;'
     * @param intCode numeric code of the character the entity stands for
     * @param htmlString string to output for html; when {@code null} the named form
     *                   {@code "&" + key + ";"} is used
     * @param htmlSpecialEntity entity is affected by translateSpecialEntities property setting.
     *                   When {@code true} the xml form is the character itself, otherwise it
     *                   is the named form {@code "&" + key + ";"}.
     */
    public SpecialEntity(String key, int intCode, String htmlString, boolean htmlSpecialEntity) {
        this.key = key;
        this.intCode = intCode;
        final String namedForm = "&" + key + ";";
        this.htmlString = htmlString != null ? htmlString : namedForm;
        this.escapedXmlString = htmlSpecialEntity ? literalOf(intCode) : namedForm;
        this.htmlSpecialEntity = htmlSpecialEntity;
    }

    /**
     * Converts a numeric code to the string holding that character. Supplementary code
     * points (above U+FFFF) need a surrogate pair; a plain {@code (char)} cast would
     * silently truncate them to an unrelated character. Every other value keeps the
     * plain cast.
     */
    private static String literalOf(int code) {
        if (Character.isSupplementaryCodePoint(code)) {
            return new String(Character.toChars(code));
        }
        return String.valueOf((char) code);
    }

    /**
     * @return the key
     */
    public String getKey() {
        return key;
    }

    /**
     * @return the intCode
     */
    public int intValue() {
        return intCode;
    }

    /**
     * @return the string to output for html: the {@code htmlString} constructor argument,
     *         or the named form {@code "&" + key + ";"} when that argument was {@code null}
     */
    public String getHtmlString() {
        return htmlString;
    }

    /**
     * @return the string to output for xml: the character itself for html special
     *         entities, otherwise the named form {@code "&" + key + ";"}
     */
    public String getEscapedXmlString() {
        return this.escapedXmlString;
    }

    /**
     * @param htmlEscaped {@code true} to get the html form, {@code false} for the xml form
     * @return {@link #getHtmlString()} or {@link #getEscapedXmlString()} accordingly
     */
    public String getEscaped(boolean htmlEscaped) {
        return htmlEscaped?this.getHtmlString():this.getEscapedXmlString();
    }

    /**
     * @return the {@code htmlSpecialEntity} flag given to the constructor
     */
    public boolean isHtmlSpecialEntity() {
        return htmlSpecialEntity;
    }

    /**
     * @return {@link #intValue()} cast to a char; codes above U+FFFF do not fit in a
     *         single char and are truncated by the cast
     */
    public char charValue() {
        return (char) intValue();
    }

    /**
     * @return Numeric Character Reference in decimal format
     */
    public String getDecimalNCR() {
        return "&#" + intCode + ";";
    }

    /**
     * @return Numeric Character Reference in hex format
     */
    public String getHexNCR() {
        return "&#x" + Integer.toHexString(intCode) + ";";
    }

    /**
     * @return Escaped value of the entity, i.e. {@code "&" + key + ";"}
     */
    public String getEscapedValue() {
        return "&" + key + ";";
    }
}
+447
View File
@@ -0,0 +1,447 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.*;
/**
* <p>
* Class contains information about single HTML tag.<br/>
* It also contains rules for tag balancing. For each tag, list of dependent
* tags may be defined. There are several kinds of dependencies used to reorder
* tags:
* <ul>
* <li>
* fatal tags - required outer tag - the tag will be ignored during
* parsing (will be skipped) if this fatal tag is missing. For example, most web
* browsers ignore elements TD, TR, TBODY if they are not in the context of TABLE tag.
* </li>
* <li>
* required enclosing tags - if there is no such, it is implicitly
* created. For example if TD is out of TR - open TR is created before.
* </li>
* <li>
* forbidden tags - it is not allowed to occur inside - for example
* FORM cannot be inside other FORM and it will be ignored during cleanup.
* </li>
* <li>
* allowed children tags - for example TR allows TD and TH. If there
* are some dependent allowed tags defined then cleaner ignores other tags, treating
* them as not allowed, unless they are in some other relationship with this tag.
* </li>
* <li>
* preferred child tag - where a child tag doesn't match, but we want to by default
* insert an intervening tag rather than just move it outside. For example, LI in UL, TD in TR.
* </li>
* <li>
* higher level tags - for example for TR higher tags are THEAD, TBODY, TFOOT.
* </li>
* <li>
* tags that must be closed and copied - for example, in
* <code>&lt;a href="#"&gt;&lt;div&gt;....</code> tag A must be closed before DIV but
* copied again inside DIV.
* </li>
* <li>
* tags that must be closed before closing this tag and copied again after -
* for example, in <code>&lt;i&gt;&lt;b&gt;at&lt;/i&gt; first&lt;/b&gt; text </code>
* tag B must be closed before closing I, but it must be copied again after resulting
* finally in sequence: <code>&lt;i&gt;&lt;b&gt;at&lt;/b&gt;&lt;/i&gt;&lt;b&gt; first&lt;/b&gt; text </code>.
* </li>
* </ul>
* </p>
*
* <p>
* Tag TR for instance (table row) may define the following dependencies:
* <ul>
* <li>fatal tag is <code>table</code></li>
* <li>required enclosing tag is <code>tbody</code></li>
* <li>allowed children tags are <code>td,th</code></li>
* <li>higher level tags are <code>thead,tfoot</code></li>
* <li>tags that must be closed before are <code>tr,td,th,caption,colgroup</code></li>
* </ul>
* meaning the following: <br>
* <ul>
* <li><code>tr</code> must be in context of <code>table</code>, otherwise it will be ignored,</li>
* <li><code>tr</code> may be directly inside <code>tbody</code>, <code>tfoot</code> and <code>thead</code>,
* otherwise <code>tbody</code> will be implicitly created in front of it.</li>
* <li><code>tr</code> can contain <code>td</code> and <code>th</code>, all other tags and content will be pushed out of current
* limiting context, in the case of html tables, in front of enclosing <code>table</code> tag.</li>
* <li>if previous open tag is one of <code>tr</code>, <code>caption</code> or <code>colgroup</code>, it will be implicitly closed.</li>
* </ul>
* </p>
*/
public class TagInfo {

    private String name;
    private final ContentType contentType;
    private Set<String> mustCloseTags = new HashSet<String>();
    private Set<String> higherTags = new HashSet<String>();
    private Set<String> childTags = new HashSet<String>();
    // Despite the name, this set is filled by defineForbiddenTags(): allowsItem() rejects
    // tag tokens whose name is contained in it.
    private Set<String> permittedTags = new HashSet<String>();
    private Set<String> copyTags = new HashSet<String>();
    private Set<String> continueAfterTags = new HashSet<String>();
    private BelongsTo belongsTo = BelongsTo.BODY;
    private final Set<String> requiredParentTags = new HashSet<String>();
    private final Set<String> fatalTags = new HashSet<String>();

    private String preferredChildTag = null;
    private String assumedNamespace = null;
    private String assumedNamespacePrefix = null;

    private boolean deprecated;
    private boolean unique;
    private final CloseTag closeTag;
    private Display display;

    /**
     * @param name tag name
     * @param contentType kind of content the tag may hold; consulted by {@link #allowsItem(BaseToken)}
     * @param belongsTo document section the tag belongs to
     * @param deprecated initial value of the deprecated flag
     * @param unique initial value of the unique flag
     * @param ignorePermitted not used by this constructor; kept so existing callers compile
     * @param closeTag consulted by {@link #isMinimizedTagPermitted()}
     * @param display display value of the tag
     */
    public TagInfo(String name, ContentType contentType, BelongsTo belongsTo, boolean deprecated, boolean unique, boolean ignorePermitted, CloseTag closeTag, Display display) {
        this.name = name;
        this.contentType = contentType;
        this.belongsTo = belongsTo;
        this.deprecated = deprecated;
        this.unique = unique;
        this.closeTag = closeTag;
        this.display = display;
    }

    /**
     * Splits a comma separated tag list into individual lower-case tag names.
     * Lower-casing uses {@link Locale#ROOT} so the result does not depend on the default
     * locale (under a Turkish locale an upper-case "I" would otherwise become a dotless "ı").
     * Whitespace around each name is removed and blank entries are skipped.
     */
    private static List<String> parseTagList(String commaSeparatedListOfTags) {
        List<String> tags = new ArrayList<String>();
        StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(Locale.ROOT), ",");
        while (tokenizer.hasMoreTokens()) {
            String tag = tokenizer.nextToken().trim();
            if (tag.length() > 0) {
                tags.add(tag);
            }
        }
        return tags;
    }

    /**
     * Adds the listed tags to the fatal tags and to the higher level tags.
     *
     * @param commaSeparatedListOfTags tag names separated by commas
     */
    public void defineFatalTags(String commaSeparatedListOfTags) {
        List<String> tags = parseTagList(commaSeparatedListOfTags);
        this.fatalTags.addAll(tags);
        this.higherTags.addAll(tags);
    }

    /**
     * Adds the listed tags to the required parent tags and to the higher level tags.
     *
     * @param commaSeparatedListOfTags tag names separated by commas
     */
    public void defineRequiredEnclosingTags(String commaSeparatedListOfTags) {
        List<String> tags = parseTagList(commaSeparatedListOfTags);
        this.requiredParentTags.addAll(tags);
        this.higherTags.addAll(tags);
    }

    /**
     * Adds the listed tags to the set returned by {@link #getPermittedTags()}, which
     * {@link #allowsItem(BaseToken)} treats as the tags that are NOT allowed inside this tag.
     *
     * @param commaSeparatedListOfTags tag names separated by commas
     */
    public void defineForbiddenTags(String commaSeparatedListOfTags) {
        this.permittedTags.addAll(parseTagList(commaSeparatedListOfTags));
    }

    /**
     * Adds the listed tags to the allowed child tags.
     *
     * @param commaSeparatedListOfTags tag names separated by commas
     */
    public void defineAllowedChildrenTags(String commaSeparatedListOfTags) {
        this.childTags.addAll(parseTagList(commaSeparatedListOfTags));
    }

    /**
     * Adds the listed tags to the higher level tags.
     *
     * @param commaSeparatedListOfTags tag names separated by commas
     */
    public void defineHigherLevelTags(String commaSeparatedListOfTags) {
        this.higherTags.addAll(parseTagList(commaSeparatedListOfTags));
    }

    /**
     * Adds the listed tags to the copy tags and to the must-close tags.
     *
     * @param commaSeparatedListOfTags tag names separated by commas
     */
    public void defineCloseBeforeCopyInsideTags(String commaSeparatedListOfTags) {
        List<String> tags = parseTagList(commaSeparatedListOfTags);
        this.copyTags.addAll(tags);
        this.mustCloseTags.addAll(tags);
    }

    /**
     * Adds the listed tags to the continue-after tags.
     *
     * @param commaSeparatedListOfTags tag names separated by commas
     */
    public void defineCloseInsideCopyAfterTags(String commaSeparatedListOfTags) {
        this.continueAfterTags.addAll(parseTagList(commaSeparatedListOfTags));
    }

    /**
     * Adds the listed tags to the must-close tags.
     *
     * @param commaSeparatedListOfTags tag names separated by commas
     */
    public void defineCloseBeforeTags(String commaSeparatedListOfTags) {
        this.mustCloseTags.addAll(parseTagList(commaSeparatedListOfTags));
    }

    // getters and setters

    public String getAssumedNamespace() {
        return assumedNamespace;
    }

    public void setAssumedNamespace(String assumedNamespace) {
        this.assumedNamespace = assumedNamespace;
    }

    public String getAssumedNamespacePrefix() {
        return assumedNamespacePrefix;
    }

    public void setAssumedNamespacePrefix(String assumedNamespacePrefix) {
        this.assumedNamespacePrefix = assumedNamespacePrefix;
    }

    public Display getDisplay() {
        return display;
    }

    public void setDisplay(Display display) {
        this.display = display;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public ContentType getContentType() {
        return contentType;
    }

    public Set<String> getMustCloseTags() {
        return mustCloseTags;
    }

    public void setMustCloseTags(Set<String> mustCloseTags) {
        this.mustCloseTags = mustCloseTags;
    }

    public Set<String> getHigherTags() {
        return higherTags;
    }

    public void setHigherTags(Set<String> higherTags) {
        this.higherTags = higherTags;
    }

    public Set<String> getChildTags() {
        return childTags;
    }

    public void setChildTags(Set<String> childTags) {
        this.childTags = childTags;
    }

    /**
     * @return the set filled by {@link #defineForbiddenTags(String)}; see the note there
     *         about how {@link #allowsItem(BaseToken)} interprets it
     */
    public Set<String> getPermittedTags() {
        return permittedTags;
    }

    public void setPermittedTags(Set<String> permittedTags) {
        this.permittedTags = permittedTags;
    }

    public Set<String> getCopyTags() {
        return copyTags;
    }

    public void setCopyTags(Set<String> copyTags) {
        this.copyTags = copyTags;
    }

    public Set<String> getContinueAfterTags() {
        return continueAfterTags;
    }

    public void setContinueAfterTags(Set<String> continueAfterTags) {
        this.continueAfterTags = continueAfterTags;
    }

    public Set<String> getRequiredParentTags() {
        return requiredParentTags;
    }

    /**
     * Adds (does not replace) a required parent tag. The name is stored as given.
     */
    public void setRequiredParent(String requiredParent) {
        this.requiredParentTags.add(requiredParent);
    }

    public BelongsTo getBelongsTo() {
        return belongsTo;
    }

    public void setBelongsTo(BelongsTo belongsTo) {
        this.belongsTo = belongsTo;
    }

    public Set<String> getFatalTags(){
        return this.fatalTags;
    }

    /**
     * @param tag tag name to test
     * @return true if {@code tag} is one of the fatal tags
     */
    public boolean isFatalTag(String tag){
        return this.fatalTags.contains(tag);
    }

    /**
     * Adds (does not replace) a fatal tag. The name is stored as given.
     */
    public void setFatalTag(String fatalTag) {
        this.fatalTags.add(fatalTag);
    }

    public boolean isDeprecated() {
        return deprecated;
    }

    public void setDeprecated(boolean deprecated) {
        this.deprecated = deprecated;
    }

    public boolean isUnique() {
        return unique;
    }

    public void setUnique(boolean unique) {
        this.unique = unique;
    }

    /**
     * @return true if the content type is {@code ContentType.none}
     */
    public boolean isEmptyTag() {
        return ContentType.none == contentType;
    }

    // other functionality

    /**
     * @return true unless the content type is {@code ContentType.none}
     */
    boolean allowsBody() {
        return ContentType.none != contentType;
    }

    boolean isHigher(String tagName) {
        return higherTags.contains(tagName);
    }

    boolean isCopy(String tagName) {
        return copyTags.contains(tagName);
    }

    boolean hasCopyTags() {
        return !copyTags.isEmpty();
    }

    boolean isContinueAfter(String tagName) {
        return continueAfterTags.contains(tagName);
    }

    boolean hasPermittedTags() {
        return !permittedTags.isEmpty();
    }

    boolean isHeadTag() {
        return belongsTo == BelongsTo.HEAD;
    }

    /**
     * @return true if the tag belongs to {@code HEAD} or to {@code HEAD_AND_BODY}
     */
    boolean isHeadAndBodyTag() {
        return belongsTo == BelongsTo.HEAD || belongsTo == BelongsTo.HEAD_AND_BODY;
    }

    /**
     * @param tagInfo info of the other tag, may be null
     * @return true if the other tag's name is one of this tag's must-close tags, or the
     *         other tag has content type {@code text}; false when {@code tagInfo} is null
     */
    boolean isMustCloseTag(TagInfo tagInfo) {
        if (tagInfo == null) {
            return false;
        }
        return mustCloseTags.contains( tagInfo.getName() ) || tagInfo.contentType == ContentType.text;
    }

    /**
     *
     * @param token
     * @return true if the passed token is allowed to be nested in a Tag with this TagInfo.
     */
    boolean allowsItem(BaseToken token) {
        final boolean isTag = token instanceof TagToken;

        // script tags are accepted by every tag that is not of content type none
        if ( contentType != ContentType.none && isTag ) {
            if ( "script".equals(((TagToken) token).getName()) ) {
                return true;
            }
        }

        switch (contentType) {
            case all:
                if ( !childTags.isEmpty() ) {
                    // an explicit child list acts as a whitelist for tag tokens
                    if ( isTag ) {
                        return childTags.contains( ((TagToken) token).getName() );
                    }
                } else if ( !permittedTags.isEmpty() ) {
                    // otherwise the (misnamed) permittedTags set acts as a blacklist
                    if ( isTag ) {
                        return !permittedTags.contains( ((TagToken) token).getName() );
                    }
                }
                return true;
            case text:
                return !isTag;
            case none:
                if ( token instanceof ContentNode ) {
                    // allow white space in outputed html
                    return ((ContentNode) token).isBlank();
                }
                // allow directives (anything that is not a tag), reject tags
                return !isTag;
            default:
                return false;
        }
    }

    /**
     * @return true if the content type is {@code all} and no allowed child tags are defined
     */
    boolean allowsAnything() {
        return ContentType.all == contentType && childTags.isEmpty();
    }

    /**
     * @return True if the tag can be minimized
     */
    public boolean isMinimizedTagPermitted() {
        return this.closeTag.isMinimizedTagPermitted();
    }

    public String getPreferredChildTag() {
        return preferredChildTag;
    }

    public void setPreferredChildTag(String preferredChildTag) {
        this.preferredChildTag = preferredChildTag;
    }
}
+889
View File
@@ -0,0 +1,889 @@
/* Copyright (c) 2006-2014, HTMLCleaner project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
http://htmlcleaner.sourceforge.net/
*/
package org.htmlcleaner;
import java.io.IOException;
import java.io.Writer;
import java.util.*;
import java.util.Map.Entry;
import org.htmlcleaner.conditional.ITagNodeCondition;
import org.htmlcleaner.conditional.TagAllCondition;
import org.htmlcleaner.conditional.TagNodeAttExistsCondition;
import org.htmlcleaner.conditional.TagNodeAttValueCondition;
import org.htmlcleaner.conditional.TagNodeNameCondition;
/**
* <p>
* XML node tag - basic node of the cleaned HTML tree. At the same time, it represents start tag token
* after HTML parsing phase and before cleaning phase. After cleaning process, tree structure remains
* containing tag nodes (TagNode class), content (text nodes - ContentNode), comments (CommentNode)
* and optionally doctype node (DoctypeToken).
* </p>
*/
public class TagNode extends TagToken implements HtmlNode {
private final LinkedHashMap<String, String> attributes = new LinkedHashMap<String, String>();
private final List<BaseToken> children = new ArrayList<BaseToken>();
private DoctypeToken docType;
private List<BaseToken> itemsToMove;
private Map<String, String> nsDeclarations;
private transient boolean isFormed;
/**
* Used to indicate a start tag that was auto generated because {@link TagInfo#isContinueAfter(String)}(closedTag.getName()) returned true
* For example,
* <pre>
* <b><i>foo</b>bar
* </pre>
* would result in a new <i> being created resulting in
* <pre>
* <b><i>foo</i></b><i>bar</i>
* </pre>
* The second opening <i> tag is marked as autogenerated. This allows the autogenerated tag to be removed if it is unneeded.
*/
private boolean autoGenerated;
/**
* This flag is set if we are using namespace aware setting, and the tagnode belongs
* to a non-HTML namespace.
*/
private boolean isForeignMarkup;
/**
* This flag is set if foreignMarkup is set; if it is false it means that the tagnode tree has not been built and so
* it isn't known whether this node is a HTML node or foreign markup such as SVG.
*/
private boolean foreignMarkupFlagSet = false;
/**
* This flag is set if attribute values should be trimmed.
*/
private boolean isTrimAttributeValues = true;
/**
* Indicates that the node was marked to be pruned out of the tree.
*/
private boolean pruned;
/**
* Indicates that the node is a copy of another node.
* @see #makeCopy()
*/
private final boolean isCopy;
/**
 * Creates a tag node with the given name that is not flagged as a copy.
 *
 * @param name name of the tag, handed to the private constructor unchanged
 */
public TagNode(String name) {
this(name, false);
}
/**
 * Creates a tag node and records whether it was produced by {@link #makeCopy()}.
 *
 * @param name   name of the tag, passed to the superclass constructor
 * @param isCopy value stored in the final {@code isCopy} flag
 */
private TagNode(String name, boolean isCopy) {
super(name);
this.isCopy = isCopy;
}
/**
 * Returns the name of this tag.
 *
 * @return the name exactly as stored when this node is foreign markup
 *         (e.g. SVG), otherwise the lower-cased name; {@code null} if no
 *         name is set
 * @see org.htmlcleaner.TagToken#getName()
 */
@Override
public String getName() {
    // Foreign markup keeps its original casing; a missing name has nothing to lower-case.
    if (this.isForeignMarkup || name == null) {
        return name;
    }
    return name.toLowerCase();
}
/**
 * Looks up an attribute value, comparing attribute names case-insensitively.
 *
 * @param attName name of the attribute to look up; may be {@code null}
 * @return value of the specified attribute, or {@code null} if {@code attName}
 *         is {@code null} or this tag doesn't contain the attribute
 */
public String getAttributeByName(String attName) {
    if (attName == null) {
        return null;
    }
    // Names are compared in lower case so that lookups are case-insensitive.
    final Map<String, String> lowerCased = getAttributesInLowerCase();
    return lowerCased.get(attName.toLowerCase());
}
/**
 * Returns a snapshot of the attributes of this tag node.
 *
 * @return a new insertion-ordered map holding all attribute name/value pairs;
 *         changes to the returned map do not affect this node
 */
public Map<String, String> getAttributes() {
    final Map<String, String> snapshot = new LinkedHashMap<String, String>();
    snapshot.putAll(this.attributes);
    return snapshot;
}
/**
 * Returns the attributes of this tag node keyed by lower-cased name.
 *
 * @return a new map containing all attribute name/value pairs, with attribute
 *         names transformed to lower case
 */
public Map<String, String> getAttributesInLowerCase() {
    final Map<String, String> lowered = attributesToLowerCase();
    return lowered;
}
/**
 * Replaces the current set of attributes with a new set.
 *
 * <p>Once the foreign markup flag has been set (the tree has been built) the
 * supplied map simply replaces the existing attributes. Before that point it
 * is not yet known whether this element is foreign markup, so the original
 * casing of any existing attribute name is kept: an incoming name that matches
 * an existing name ignoring case is stored under the existing name. When
 * several incoming names map to the same stored name, the first value wins.</p>
 *
 * <p>If the current thread is found to be interrupted while processing, the
 * method returns without changing the attributes.</p>
 *
 * @param attributes the replacement attributes
 */
public void setAttributes(Map<String, String> attributes) {
    if (foreignMarkupFlagSet) {
        // The tree has been built, so it is safe to just set the attributes.
        replaceAttributes(attributes);
        return;
    }

    // Holds the processed contents until the whole input has been examined.
    final LinkedHashMap<String, String> merged = new LinkedHashMap<String, String>();

    for (Map.Entry<String, String> incoming : attributes.entrySet()) {
        if (Thread.currentThread().isInterrupted()) {
            // A large attribute map can make this loop slow; give up early.
            handleInterruption();
            return;
        }

        final String incomingName = incoming.getKey();

        // Reuse the casing of an existing attribute with the same name, if any.
        String nameToStore = incomingName;
        for (String currentName : this.attributes.keySet()) {
            if (currentName.equalsIgnoreCase(incomingName)) {
                nameToStore = currentName;
            }
        }

        // If we have duplicates, keep the first value.
        if (!merged.containsKey(nameToStore)) {
            merged.put(nameToStore, incoming.getValue());
        }
    }

    replaceAttributes(merged);
}
/**
 * Discards every existing attribute and copies in the replacement attributes.
 *
 * @param attributes the attributes to set
 */
private void replaceAttributes(Map<String, String> attributes) {
    final Map<String, String> target = this.attributes;
    target.clear();
    target.putAll(attributes);
}
/**
 * Checks existence of the specified attribute, ignoring the case of its name.
 *
 * @param attName attribute name to look for; may be {@code null}
 * @return true if this TagNode has an attribute with that name
 */
public boolean hasAttribute(String attName) {
    if (attName != null) {
        // Stored names may use any casing, so every key has to be compared.
        final Iterator<String> names = attributes.keySet().iterator();
        while (names.hasNext()) {
            if (names.next().equalsIgnoreCase(attName)) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Adds the specified attribute to this tag unless an attribute with the same
 * (trimmed) name is already present, in which case the existing value is kept.
 *
 * <p>The name is trimmed, and lower-cased when this node is known not to be
 * foreign markup. A {@code null} value is stored as the empty string. When
 * attribute value trimming is enabled the value is trimmed and each control
 * character in it is replaced by a space.</p>
 *
 * @param attName  attribute name; {@code null} or blank names are ignored
 * @param attValue attribute value; may be {@code null}
 */
@Override
public void addAttribute(String attName, String attValue) {
    if (attName == null) {
        return;
    }

    String key = attName.trim();
    if (foreignMarkupFlagSet && !isForeignMarkup) {
        key = key.toLowerCase();
    }

    String value = (attValue != null) ? attValue : "";
    if (isTrimAttributeValues) {
        value = value.trim().replaceAll("\\p{Cntrl}", " ");
    }

    // Blank names are dropped; an existing entry is kept rather than overwritten.
    if (key.length() == 0 || attributes.containsKey(key)) {
        return;
    }
    attributes.put(key, value);
}
/**
 * Removes the specified attribute from this tag.
 *
 * <p>The name is trimmed and matched against the stored attribute names
 * ignoring case, in the same way as {@link #hasAttribute(String)}. This also
 * removes attributes that are stored with their original casing, which is the
 * case for foreign markup and for nodes whose foreign markup flag has not
 * been set yet.</p>
 *
 * @param attName name of the attribute to remove; {@code null} or blank names
 *                are ignored
 */
public void removeAttribute(String attName) {
    if (attName == null) {
        return;
    }
    final String wanted = attName.trim();
    if (wanted.length() == 0) {
        return;
    }
    // Remove through the iterator so the map can be modified while it is scanned.
    for (Iterator<String> names = attributes.keySet().iterator(); names.hasNext();) {
        if (names.next().equalsIgnoreCase(wanted)) {
            names.remove();
        }
    }
}
/**
 * Returns the child tag nodes of this node.
 *
 * @return List of child TagNode objects.
 * @deprecated use {@link TagNode#getChildTagList()}, will be refactored and possibly removed in
 *             future versions. TODO This method should be refactored because it does not
 *             properly match the commonly used Java getter/setter strategy.
 */
@Deprecated
public List<TagNode> getChildren() {
    final List<TagNode> childTags = getChildTagList();
    return childTags;
}
/**
 * Replaces all children of this node with the elements of the given list.
 *
 * <p>The argument is copied before the current children are cleared, so it is
 * safe to pass the list returned by {@link #getAllChildren()}. Parent
 * references of the new children are not modified by this method.</p>
 *
 * @param children the new children; must not be {@code null}
 * @throws NullPointerException if {@code children} is {@code null}; the
 *                              existing children are left untouched
 */
public void setChildren(List<? extends BaseToken> children) {
    // Snapshot first: the argument may be a view of this.children itself.
    final List<BaseToken> snapshot = new ArrayList<BaseToken>(children);
    this.children.clear();
    this.children.addAll(snapshot);
}
/**
 * Returns all children of this node, whatever their type.
 *
 * @return the internal child list itself (not a copy)
 */
public List<? extends BaseToken> getAllChildren() {
    return this.children;
}
/**
 * Collects the direct children of this node that are tag nodes.
 *
 * @return a new list of child TagNode objects, in child order
 */
public List<TagNode> getChildTagList() {
    final List<TagNode> tagsOnly = new ArrayList<TagNode>();
    final Iterator<BaseToken> cursor = children.iterator();
    while (cursor.hasNext()) {
        final Object candidate = cursor.next();
        if (candidate instanceof TagNode) {
            tagsOnly.add((TagNode) candidate);
        }
    }
    return tagsOnly;
}
/**
 * Tells whether this node has any children of any type.
 *
 * @return true if the child list is not empty
 */
public boolean hasChildren() {
    return children.size() > 0;
}
/**
 * Returns the direct child tag nodes as an array.
 *
 * @return a new array of child TagNode instances, in child order
 */
public TagNode[] getChildTags() {
    final List<TagNode> childTags = getChildTagList();
    return childTags.toArray(new TagNode[childTags.size()]);
}
/**
 * Concatenates the text of this node's content children and, recursively,
 * of its tag children. Children of other types contribute nothing.
 *
 * @return Text content of this node and its subelements.
 */
public CharSequence getText() {
    final StringBuilder collected = new StringBuilder();
    for (Object child : children) {
        if (child instanceof ContentNode) {
            collected.append(((ContentNode) child).getContent());
            continue;
        }
        if (child instanceof TagNode) {
            collected.append(((TagNode) child).getText());
        }
    }
    return collected;
}
/**
 * Finds the position of a child, comparing by reference identity.
 *
 * @param child Child to find index of
 * @return Index of the specified child node inside this node's children, -1 if node is not the
 *         child
 */
public int getChildIndex(HtmlNode child) {
    final int count = children.size();
    for (int position = 0; position < count; position++) {
        if (children.get(position) == child) {
            return position;
        }
    }
    return -1;
}
/**
 * Inserts the specified node at the specified position in the list of children.
 *
 * <p>As in {@link #addChild(Object)}, an inserted {@link TagNode} gets this
 * node as its parent, so that operations which walk up the tree from the
 * child (for example {@link #removeFromTree()}) work for inserted nodes too.</p>
 *
 * @param index      position at which the node is inserted
 * @param childToAdd node to insert
 * @throws IndexOutOfBoundsException if {@code index} is outside the range
 *                                   accepted by the underlying list
 */
public void insertChild(int index, HtmlNode childToAdd) {
    children.add(index, childToAdd);
    // Only update the parent once the insertion has succeeded.
    if (childToAdd instanceof TagNode) {
        ((TagNode) childToAdd).parent = this;
    }
}
/**
 * Inserts a node into the list of children directly before an existing child.
 * Nothing happens when {@code node} is not a child of this node.
 *
 * @param node Child before which to insert new node
 * @param nodeToInsert Node to be inserted at specified position
 */
public void insertChildBefore(HtmlNode node, HtmlNode nodeToInsert) {
    final int position = getChildIndex(node);
    if (position < 0) {
        return;
    }
    insertChild(position, nodeToInsert);
}
/**
 * Inserts a node into the list of children directly after an existing child.
 * Nothing happens when {@code node} is not a child of this node.
 *
 * @param node Child after which to insert new node
 * @param nodeToInsert Node to be inserted at specified position
 */
public void insertChildAfter(HtmlNode node, HtmlNode nodeToInsert) {
    final int position = getChildIndex(node);
    if (position < 0) {
        return;
    }
    insertChild(position + 1, nodeToInsert);
}
/**
 * @return the doctype token stored on this node, or {@code null} if none was set
 */
public DoctypeToken getDocType() {
return docType;
}
/**
 * @param docType the doctype token to store on this node
 */
public void setDocType(DoctypeToken docType) {
this.docType = docType;
}
/**
 * Appends a child to this node.
 *
 * <ul>
 * <li>{@code null} is ignored;</li>
 * <li>a {@link List} has each of its elements added via {@link #addChildren(List)};</li>
 * <li>a {@link ProxyTagNode} contributes the token it wraps;</li>
 * <li>any other {@link BaseToken} is appended as is, and if it is a
 *     {@link TagNode} this node becomes its parent.</li>
 * </ul>
 *
 * @param child the object to add
 * @throws RuntimeException if {@code child} is none of the supported types
 */
public void addChild(Object child) {
    if (child == null) {
        return;
    }
    if (child instanceof List) {
        addChildren((List) child);
        return;
    }
    // Must be tested before the generic BaseToken case.
    if (child instanceof ProxyTagNode) {
        children.add(((ProxyTagNode) child).getToken());
        return;
    }
    if (!(child instanceof BaseToken)) {
        throw new RuntimeException("Attempted to add invalid child object to TagNode; class="+child.getClass());
    }
    children.add((BaseToken) child);
    if (child instanceof TagNode) {
        ((TagNode) child).parent = this;
    }
}
/**
 * Adds every element of the specified list to this node, one at a time,
 * through {@link #addChild(Object)}.
 *
 * @param newChildren elements to add; {@code null} is ignored
 */
public void addChildren(List newChildren) {
    if (newChildren == null) {
        return;
    }
    for (Iterator cursor = newChildren.iterator(); cursor.hasNext();) {
        addChild(cursor.next());
    }
}
/**
 * Finds the first element below this node that satisfies the specified
 * condition. Each tag child is tested before its own descendants are searched.
 *
 * @param condition   condition to test; {@code null} yields {@code null}
 * @param isRecursive whether descendants of the children are searched as well
 * @return First TagNode found, or null if no such elements.
 */
private TagNode findElement(ITagNodeCondition condition, boolean isRecursive) {
    if (condition == null) {
        return null;
    }
    for (Object item : children) {
        if (!(item instanceof TagNode)) {
            continue;
        }
        final TagNode candidate = (TagNode) item;
        if (condition.satisfy(candidate)) {
            return candidate;
        }
        if (isRecursive) {
            final TagNode nested = candidate.findElement(condition, true);
            if (nested != null) {
                return nested;
            }
        }
    }
    return null;
}
/**
 * Gets all elements below this node that satisfy the specified condition.
 * A matching child is listed before any matches found among its descendants.
 *
 * @param condition   condition to test; {@code null} yields an empty list
 * @param isRecursive whether descendants of the children are searched as well
 * @return List of TagNode instances; never {@code null}.
 */
private List<TagNode> findMatchingTagNodes(ITagNodeCondition condition, boolean isRecursive) {
    final List<TagNode> matches = new LinkedList<TagNode>();
    if (condition != null) {
        for (Object item : children) {
            if (!(item instanceof TagNode)) {
                continue;
            }
            final TagNode tag = (TagNode) item;
            if (condition.satisfy(tag)) {
                matches.add(tag);
            }
            if (isRecursive) {
                matches.addAll(tag.findMatchingTagNodes(condition, true));
            }
        }
    }
    return matches;
}
/**
 * Gets all elements below this node that satisfy the specified condition.
 *
 * @param condition   condition each element must satisfy
 * @param isRecursive whether descendants of the children are searched as well
 * @return List of matching TagNode instances.
 */
public List<? extends TagNode> getElementList(ITagNodeCondition condition, boolean isRecursive) {
    final List<TagNode> matches = findMatchingTagNodes(condition, isRecursive);
    return matches;
}
/**
 * Array form of the condition-based element search.
 *
 * @param condition   condition each element must satisfy
 * @param isRecursive whether descendants of the children are searched as well
 * @return The array of all subelements that satisfy specified condition;
 *         empty when nothing matches.
 */
private TagNode[] getElements(ITagNodeCondition condition, boolean isRecursive) {
    // findMatchingTagNodes never returns null, so the list can be converted directly.
    final List<TagNode> matches = findMatchingTagNodes(condition, isRecursive);
    return matches.toArray(new TagNode[matches.size()]);
}
/**
 * Lists elements below this node selected by a {@link TagAllCondition}.
 *
 * @param isRecursive whether descendants of the children are searched as well
 * @return list of the selected elements
 */
public List<? extends TagNode> getAllElementsList(boolean isRecursive) {
return getElementList(new TagAllCondition(), isRecursive);
}
/**
 * Array variant of {@link #getAllElementsList(boolean)}.
 *
 * @param isRecursive whether descendants of the children are searched as well
 * @return array of the selected elements
 */
public TagNode[] getAllElements(boolean isRecursive) {
return getElements(new TagAllCondition(), isRecursive);
}
/**
 * Finds the first element selected by a {@link TagNodeNameCondition} built from the given name.
 *
 * @param findName    name handed to the condition
 * @param isRecursive whether descendants of the children are searched as well
 * @return the first selected element, or {@code null} if there is none
 */
public TagNode findElementByName(String findName, boolean isRecursive) {
return findElement(new TagNodeNameCondition(findName), isRecursive);
}
/**
 * Lists all elements selected by a {@link TagNodeNameCondition} built from the given name.
 *
 * @param findName    name handed to the condition
 * @param isRecursive whether descendants of the children are searched as well
 * @return list of the selected elements
 */
public List<? extends TagNode> getElementListByName(String findName, boolean isRecursive) {
return getElementList(new TagNodeNameCondition(findName), isRecursive);
}
/**
 * Array variant of {@link #getElementListByName(String, boolean)}.
 *
 * @param findName    name handed to the condition
 * @param isRecursive whether descendants of the children are searched as well
 * @return array of the selected elements
 */
public TagNode[] getElementsByName(String findName, boolean isRecursive) {
return getElements(new TagNodeNameCondition(findName), isRecursive);
}
/**
 * Finds the first element selected by a {@link TagNodeAttExistsCondition} built from the given
 * attribute name.
 *
 * @param attName     attribute name handed to the condition
 * @param isRecursive whether descendants of the children are searched as well
 * @return the first selected element, or {@code null} if there is none
 */
public TagNode findElementHavingAttribute(String attName, boolean isRecursive) {
return findElement(new TagNodeAttExistsCondition(attName), isRecursive);
}
/**
 * Lists all elements selected by a {@link TagNodeAttExistsCondition} built from the given
 * attribute name.
 *
 * @param attName     attribute name handed to the condition
 * @param isRecursive whether descendants of the children are searched as well
 * @return list of the selected elements
 */
public List<? extends TagNode> getElementListHavingAttribute(String attName, boolean isRecursive) {
return getElementList(new TagNodeAttExistsCondition(attName), isRecursive);
}
/**
 * Array variant of {@link #getElementListHavingAttribute(String, boolean)}.
 *
 * @param attName     attribute name handed to the condition
 * @param isRecursive whether descendants of the children are searched as well
 * @return array of the selected elements
 */
public TagNode[] getElementsHavingAttribute(String attName, boolean isRecursive) {
return getElements(new TagNodeAttExistsCondition(attName), isRecursive);
}
/**
 * Finds the first element selected by a {@link TagNodeAttValueCondition} built from the given
 * attribute name, value and case-sensitivity flag.
 *
 * @param attName         attribute name handed to the condition
 * @param attValue        attribute value handed to the condition
 * @param isRecursive     whether descendants of the children are searched as well
 * @param isCaseSensitive case-sensitivity flag handed to the condition
 * @return the first selected element, or {@code null} if there is none
 */
public TagNode findElementByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
return findElement(new TagNodeAttValueCondition(attName, attValue, isCaseSensitive), isRecursive);
}
/**
 * Lists all elements selected by a {@link TagNodeAttValueCondition} built from the given
 * attribute name, value and case-sensitivity flag.
 *
 * @param attName         attribute name handed to the condition
 * @param attValue        attribute value handed to the condition
 * @param isRecursive     whether descendants of the children are searched as well
 * @param isCaseSensitive case-sensitivity flag handed to the condition
 * @return list of the selected elements
 */
public List<? extends TagNode> getElementListByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
return getElementList(new TagNodeAttValueCondition(attName, attValue, isCaseSensitive), isRecursive);
}
/**
 * Array variant of {@link #getElementListByAttValue(String, String, boolean, boolean)}.
 *
 * @param attName         attribute name handed to the condition
 * @param attValue        attribute value handed to the condition
 * @param isRecursive     whether descendants of the children are searched as well
 * @param isCaseSensitive case-sensitivity flag handed to the condition
 * @return array of the selected elements
 */
public TagNode[] getElementsByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
return getElements(new TagNodeAttValueCondition(attName, attValue, isCaseSensitive), isRecursive);
}
/**
* Evaluates XPath expression on give node. <br>
* <em>
* This is not fully supported XPath parser and evaluator.
* Examples below show supported elements:
* </em> <code>
* <ul>
* <li>//div//a</li>
* <li>//div//a[@id][@class]</li>
* <li>/body/*[1]/@type</li>
* <li>//div[3]//a[@id][@href='r/n4']</li>
* <li>//div[last() >= 4]//./div[position() = last()])[position() > 22]//li[2]//a</li>
* <li>//div[2]/@*[2]</li>
* <li>data(//div//a[@id][@class])</li>
* <li>//p/last()</li>
* <li>//body//div[3][@class]//span[12.2<position()]/@id</li>
* <li>data(//a['v' < @id])</li>
* </ul>
* </code>
*
* @param xPathExpression
* @return result of XPather evaluation.
* @throws XPatherException
*/
public Object[] evaluateXPath(String xPathExpression) throws XPatherException {
    // A fresh evaluator is built for every call and run against this node.
    final XPather evaluator = new XPather(xPathExpression);
    return evaluator.evaluateAgainstNode(this);
}
/**
 * Removes this node from the child list of its parent.
 *
 * @return True if element is removed; false if this node has no parent
 *         (root node) or the parent does not list it as a child.
 */
public boolean removeFromTree() {
    if (parent == null) {
        return false;
    }
    return parent.removeChild(this);
}
/**
 * Removes the specified child element from this node.
 *
 * @param child the child to remove
 * @return True if child object existed in the children list.
 */
public boolean removeChild(Object child) {
    final boolean wasPresent = children.remove(child);
    return wasPresent;
}
/**
 * Empties the child list, removing subelements and text content alike.
 */
public void removeAllChildren() {
    children.clear();
}
/**
 * Queues an item in the list of items to move, creating that list on first use.
 *
 * @param item the item to queue; must be a {@link BaseToken}
 * @throws RuntimeException if {@code item} is not a {@link BaseToken}
 */
void addItemForMoving(Object item) {
    if (itemsToMove == null) {
        itemsToMove = new ArrayList<BaseToken>();
    }
    if (!(item instanceof BaseToken)) {
        throw new RuntimeException("Attempt to add invalid item for moving; class="+item.getClass());
    }
    itemsToMove.add((BaseToken) item);
}
/**
 * @return the list of items queued for moving, or {@code null} if none has been created or set
 */
List<? extends BaseToken> getItemsToMove() {
return itemsToMove;
}
/**
 * @param itemsToMove the list to use as the queue of items to move; stored as is, not copied
 */
void setItemsToMove(List<BaseToken> itemsToMove) {
this.itemsToMove = itemsToMove;
}
/**
 * @return the current value of the {@code isFormed} flag
 */
boolean isFormed() {
return isFormed;
}
/**
 * @param isFormed new value of the {@code isFormed} flag
 */
void setFormed(boolean isFormed) {
this.isFormed = isFormed;
}
/**
 * Sets the {@code isFormed} flag to {@code true}.
 */
void setFormed() {
setFormed(true);
}
/**
 * Sets the flag marking this start tag as automatically generated.
 *
 * @param autoGenerated the autoGenerated to set
 */
public void setAutoGenerated(boolean autoGenerated) {
this.autoGenerated = autoGenerated;
}
/**
 * @return true if this start tag is marked as automatically generated
 */
public boolean isAutoGenerated() {
return autoGenerated;
}
/**
 * @return true, if node was marked to be pruned.
 */
public boolean isPruned() {
return pruned;
}
/**
 * Sets the flag marking this node to be pruned out of the tree.
 *
 * @param pruned new value of the pruned flag
 */
public void setPruned(boolean pruned) {
this.pruned = pruned;
}
/**
 * Tells whether this node counts as empty.
 *
 * <p>A pruned node is always empty. Otherwise the node is empty only if every
 * child is either a pruned tag node or a blank content node; any other child,
 * including a comment, makes it non-empty.</p>
 *
 * @return true if this node is empty in the sense described above
 */
public boolean isEmpty() {
    if (isPruned()) {
        return true;
    }
    for (Object child : this.children) {
        if (child instanceof TagNode) {
            if (((TagNode) child).isPruned()) {
                continue;
            }
            return false;
        }
        if (child instanceof ContentNode) {
            if (((ContentNode) child).isBlank()) {
                continue;
            }
            return false;
        }
        // Comments ideally could be discarded - however standard practice is to
        // include browser specific commands in comments. Comments and any other
        // kind of child therefore count as content.
        return false;
    }
    return true;
}
/**
 * Adds a namespace declaration to this node, replacing any earlier
 * declaration for the same prefix. The backing sorted map is created on
 * first use.
 *
 * @param nsPrefix Namespace prefix
 * @param nsURI Namespace URI
 */
public void addNamespaceDeclaration(String nsPrefix, String nsURI) {
    Map<String, String> declarations = nsDeclarations;
    if (declarations == null) {
        declarations = new TreeMap<String, String>();
        nsDeclarations = declarations;
    }
    declarations.put(nsPrefix, nsURI);
}
/**
 * Collects all prefixes in namespace declarations on the path from this node
 * up to the document root.
 *
 * @param prefixes Set that receives the collected prefixes
 */
void collectNamespacePrefixesOnPath(Set<String> prefixes) {
    final Map<String, String> declared = getNamespaceDeclarations();
    if (declared != null) {
        for (Iterator<String> cursor = declared.keySet().iterator(); cursor.hasNext();) {
            prefixes.add(cursor.next());
        }
    }
    // Continue with the ancestors, if any.
    if (parent != null) {
        parent.collectNamespacePrefixesOnPath(prefixes);
    }
}
/**
 * Resolves a namespace prefix against the declarations of this node and,
 * failing that, of its ancestors.
 *
 * @param nsPrefix prefix to resolve; {@code null} stands for the declaration
 *                 stored under the empty prefix
 * @return the URI declared for the prefix by the nearest node on the path to
 *         the root, or {@code null} if no node declares it
 */
String getNamespaceURIOnPath(String nsPrefix) {
    if (nsDeclarations != null) {
        // A null prefix is looked up under the empty-string key.
        final String lookupKey = (nsPrefix == null) ? "" : nsPrefix;
        if (nsDeclarations.containsKey(lookupKey)) {
            return nsDeclarations.get(lookupKey);
        }
    }
    return (parent == null) ? null : parent.getNamespaceURIOnPath(nsPrefix);
}
/**
 * Returns the namespace declarations made on this node.
 *
 * @return the internal map of namespace declarations for this node (not a
 *         copy), or {@code null} if none has been added
 */
public Map<String, String> getNamespaceDeclarations() {
    return this.nsDeclarations;
}
/**
 * Writes this node by handing it to the given serializer.
 *
 * @param serializer serializer that performs the output
 * @param writer     destination passed on to the serializer
 * @throws IOException if the serializer reports an I/O failure
 */
public void serialize(Serializer serializer, Writer writer) throws IOException {
    final TagNode self = this;
    serializer.serialize(self, writer);
}
/**
 * Creates a copy of this node that carries the same stored name and the same
 * attributes and is flagged as a copy. Children and the remaining state of
 * this node are not transferred.
 *
 * @return the new node
 */
public TagNode makeCopy() {
    final TagNode duplicate = new TagNode(name, true);
    for (Map.Entry<String, String> attribute : attributes.entrySet()) {
        duplicate.attributes.put(attribute.getKey(), attribute.getValue());
    }
    return duplicate;
}
/**
 * @return true if this node was created by {@link #makeCopy()}
 */
public boolean isCopy() {
    return this.isCopy;
}
/**
 * Traverses the tree and performs visitor's action on each node. It stops when it finishes all
 * the tree or when visitor returns false.
 *
 * @param visitor TagNodeVisitor implementation; nothing is visited when it is {@code null}
 */
public void traverse(TagNodeVisitor visitor) {
traverseInternally(visitor);
}
/**
 * Visits this node first and then its children in order: tag children are traversed
 * recursively, content and comment children are visited directly with this node as parent.
 *
 * @param visitor the visitor to call; {@code null} means nothing is visited
 * @return false if the visitor asked to stop the traversal, true otherwise
 */
private boolean traverseInternally(TagNodeVisitor visitor) {
if (visitor != null) {
// Remember whether a parent existed before the visit so a change made by the visitor can be detected.
boolean hasParent = parent != null;
boolean toContinue = visitor.visit(parent, this);
if (!toContinue) {
return false; // if visitor stops traversal
} else if (hasParent && parent == null) {
return true; // if this node is pruned from the tree during the visit, then don't go deeper
}
for (Object child : children.toArray()) { // make an array to avoid ConcurrentModificationException when some node is cut
if (child instanceof TagNode) {
toContinue = ((TagNode) child).traverseInternally(visitor);
} else if (child instanceof ContentNode) {
toContinue = visitor.visit(this, (ContentNode) child);
} else if (child instanceof CommentNode) {
toContinue = visitor.visit(this, (CommentNode) child);
}
// A child of any other type is not visited and leaves toContinue at its previous value.
if (!toContinue) {
return false;
}
}
}
return true;
}
/**
 * @return true if this node has been flagged as belonging to a non-HTML namespace
 */
public boolean isForeignMarkup() {
    return this.isForeignMarkup;
}
/**
 * Records whether this node is foreign markup and notes that the decision has
 * been made. When the node is not foreign markup, all existing attribute names
 * are converted to lower case.
 *
 * @param isForeignMarkup the isForeignMarkup to set
 */
public void setForeignMarkup(boolean isForeignMarkup) {
    this.isForeignMarkup = isForeignMarkup;
    foreignMarkupFlagSet = true;
    if (isForeignMarkup) {
        // Foreign markup keeps the original casing of its attribute names.
        return;
    }
    replaceAttributes(getAttributesInLowerCase());
}
/**
 * @return true if attribute values are trimmed when they are added to this node
 */
public boolean isTrimAttributeValues() {
return isTrimAttributeValues;
}
/**
 * Switches trimming of attribute values, applied when attributes are added, on or off.
 *
 * @param isTrimAttributeValues the isTrimAttributeValues to set
 */
public void setTrimAttributeValues(boolean isTrimAttributeValues) {
this.isTrimAttributeValues = isTrimAttributeValues;
}
/**
 * Builds a copy of this node's attributes keyed by lower-cased name. Where
 * names differ only by case (e.g. class, CLASS) the value of the first one
 * encountered is retained.
 *
 * @return a new map of attributes in key/value pairs with names in lowercase
 */
private Map<String, String> attributesToLowerCase() {
    final Map<String, String> lowered = new LinkedHashMap<String, String>();
    for (Entry<String, String> attribute : attributes.entrySet()) {
        final String lowerName = attribute.getKey().toLowerCase();
        if (lowered.containsKey(lowerName)) {
            continue;
        }
        lowered.put(lowerName, attribute.getValue());
    }
    return lowered;
}
/**
 * Called whenever the thread is found to be interrupted. Currently this is an
 * empty placeholder, but it could hold cleanup logic and user interaction.
 */
private void handleInterruption(){
}
}
@@ -0,0 +1,16 @@
package org.htmlcleaner;
/**
 * Callback invoked for the nodes of a tag tree, for example by
 * {@code TagNode.traverse(TagNodeVisitor)}.
 */
public interface TagNodeVisitor {

    /**
     * Action to be performed on a single node in the tree.
     *
     * @param parentNode Parent of the visited node
     * @param htmlNode node visited
     * @return True if tree traversal should be continued, false if it has to stop.
     */
    boolean visit(TagNode parentNode, HtmlNode htmlNode);

}
+18
View File
@@ -0,0 +1,18 @@
package org.htmlcleaner;
/**
 * Contains information about a single open tag: its position, its name and
 * the associated tag info.
 */
class TagPos {

    int position;
    String name;
    TagInfo info;

    /**
     * @param position        value stored in {@link #position}
     * @param name            value stored in {@link #name}
     * @param tagInfo         value stored in {@link #info}
     * @param cleanTimeValues not used by this constructor
     */
    TagPos(int position, String name, TagInfo tagInfo, CleanTimeValues cleanTimeValues) {
        this.info = tagInfo;
        this.name = name;
        this.position = position;
    }

}
@@ -0,0 +1,66 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
 * <p>HTML tag token - descendants are start (TagNode) and end token (EndTagToken).</p>
 */
public abstract class TagToken extends BaseHtmlNode {

    /** Name of the tag as given to the constructor; may be {@code null}. */
    protected String name;

    /**
     * Creates a token without a name.
     */
    public TagToken() {
    }

    /**
     * Creates a token with the given name.
     *
     * @param name name of the tag, stored unchanged
     */
    public TagToken(String name) {
        this.name = name;
    }

    /**
     * @return the stored tag name
     */
    public String getName() {
        return this.name;
    }

    /**
     * @return the stored tag name
     */
    @Override
    public String toString() {
        return this.name;
    }

    /**
     * Adds an attribute to this token.
     *
     * @param attName  attribute name
     * @param attValue attribute value
     */
    abstract void addAttribute(String attName, String attValue);

}
@@ -0,0 +1,231 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.LinkedHashMap;
import java.util.regex.Pattern;
/**
* Describes how specified tag is transformed to another one, or is ignored during parsing
*/
public class TagTransformation {
public static String VAR_START = "${";
public static String VAR_END = "}";
private String sourceTag;
private String destTag;
private boolean preserveSourceAttributes;
private Map<String, String> attributeTransformations = new LinkedHashMap<String, String>();
private List<AttributeTransformation> attributePatternTransformations = new ArrayList<AttributeTransformation>();
/**
 * Creates a transformation without source or destination tag that preserves
 * source attributes.
 */
public TagTransformation() {
    preserveSourceAttributes = true;
}
/**
 * Creates new tag transformation from source tag to target tag specifying whether
 * source tag attributes are preserved.
 *
 * <p>Tag names are stored in lower case. A {@code null} destination is stored
 * as {@code null}. A destination that is not a valid XML identifier is
 * replaced by the (lower-cased) source tag name.</p>
 *
 * @param sourceTag Name of the tag to be transformed; must not be {@code null}.
 * @param destTag Name of tag to which source tag is to be transformed, may be {@code null}.
 * @param preserveSourceAttributes Tells whether source tag attributes are preserved in transformation.
 */
public TagTransformation(String sourceTag, String destTag, boolean preserveSourceAttributes) {
    this.sourceTag = sourceTag.toLowerCase();
    if (destTag == null) {
        this.destTag = null;
    } else if (Utils.isValidXmlIdentifier(destTag)) {
        this.destTag = destTag.toLowerCase();
    } else {
        // Fall back to the normalised source name so both tag names use the same casing.
        this.destTag = this.sourceTag;
    }
    this.preserveSourceAttributes = preserveSourceAttributes;
}
/**
 * Creates new tag transformation from source tag to target tag preserving
 * all source tag attributes.
 * @param sourceTag Name of the tag to be transformed.
 * @param destTag Name of tag to which source tag is to be transformed.
 */
public TagTransformation(String sourceTag, String destTag) {
this(sourceTag, destTag, true);
}
/**
 * Creates new tag transformation in which specified tag will be skipped (ignored)
 * during parsing process. The destination tag is set to {@code null}.
 * @param sourceTag Name of the tag to be skipped.
 */
public TagTransformation(String sourceTag) {
this(sourceTag, null);
}
/**
 * Adds new attribute transformation to this tag transformation. It tells what the
 * destination attribute will look like. A small templating mechanism is used to describe
 * the attribute value: all names between ${ and } inside the template are evaluated against
 * source tag attributes. That way attribute values can be built from a mix of source tag
 * attributes.
 *
 * @param targetAttName Name of the destination attribute; stored in lower case
 * @param transformationDesc Template describing attribute value.
 */
public void addAttributeTransformation(String targetAttName, String transformationDesc) {
    final String normalisedName = targetAttName.toLowerCase();
    attributeTransformations.put(normalisedName, transformationDesc);
}
/**
 * Adds a pattern-based attribute transformation that is built from an attribute
 * name pattern only (no value pattern).
 *
 * @param attNamePattern     pattern for the attribute name
 * @param transformationDesc template describing the attribute value
 */
public void addAttributePatternTransformation(Pattern attNamePattern, String transformationDesc) {
    final AttributeTransformationPatternImpl rule =
            new AttributeTransformationPatternImpl(attNamePattern, null, transformationDesc);
    attributePatternTransformations.add(rule);
}
/**
 * Adds a pattern-based attribute transformation built from an attribute name
 * pattern and an attribute value pattern.
 *
 * @param attNamePattern     pattern for the attribute name
 * @param attValuePattern    pattern for the attribute value
 * @param transformationDesc template describing the attribute value
 */
public void addAttributePatternTransformation(Pattern attNamePattern, Pattern attValuePattern, String transformationDesc) {
    final AttributeTransformationPatternImpl rule =
            new AttributeTransformationPatternImpl(attNamePattern, attValuePattern, transformationDesc);
    addAttributePatternTransformation(rule);
}
/**
 * Appends an attribute transformation to the list of pattern transformations,
 * creating the list first if it does not exist.
 *
 * @param attributeTransformation the transformation to add
 */
public void addAttributePatternTransformation(AttributeTransformation attributeTransformation) {
    List<AttributeTransformation> rules = attributePatternTransformations;
    if (rules == null) {
        rules = new ArrayList<AttributeTransformation>();
        attributePatternTransformations = rules;
    }
    rules.add(attributeTransformation);
}
/**
 * Adds new attribute transformation in which the destination attribute will not exist
 * (it is simply removed from the list of attributes). This is recorded as a
 * transformation whose template is {@code null}.
 *
 * @param targetAttName name of the attribute to remove
 */
public void addAttributeTransformation(String targetAttName) {
    final String noTemplate = null;
    addAttributeTransformation(targetAttName, noTemplate);
}
/**
 * Tells whether any attribute transformation has been registered, either by
 * attribute name or by pattern.
 *
 * <p>Both collections are created together with the object, so a plain null
 * test would always report {@code true}; the collections are therefore
 * checked for content.</p>
 *
 * @return true if at least one named or pattern-based attribute
 *         transformation exists
 */
boolean hasAttributeTransformations() {
    final boolean hasNamed =
            attributeTransformations != null && !attributeTransformations.isEmpty();
    final boolean hasPatterns =
            attributePatternTransformations != null && !attributePatternTransformations.isEmpty();
    return hasNamed || hasPatterns;
}
/**
 * @return the stored source tag name
 */
String getSourceTag() {
return sourceTag;
}
/**
 * @return the stored destination tag name; may be {@code null}
 */
String getDestTag() {
return destTag;
}
/**
 * @return whether source tag attributes are preserved by this transformation
 */
boolean isPreserveSourceAttributes() {
return preserveSourceAttributes;
}
/**
 * @return the internal map from destination attribute name to value template (not a copy)
 */
Map<String, String> getAttributeTransformations() {
return attributeTransformations;
}
/**
 * Applies this transformation to a set of source attributes.
 *
 * <p>If source attributes are preserved and no attribute transformations are
 * reported, the given map itself is returned. Otherwise a new map is built: it
 * starts as a copy of the source attributes (when they are preserved) or empty,
 * then the named transformations are applied, followed by the pattern
 * transformations evaluated against every source attribute. A {@code null}
 * template removes the attribute; any other template is evaluated against the
 * source attributes and stored as the attribute value.</p>
 *
 * @param attributes source attributes
 * @return the resulting attributes
 */
public Map<String, String> applyTagTransformations(Map<String, String> attributes) {
    final boolean keepSourceAtts = isPreserveSourceAttributes();
    final boolean transformsPresent = hasAttributeTransformations();

    if (!transformsPresent && keepSourceAtts) {
        return attributes;
    }

    final Map<String, String> result = keepSourceAtts
            ? new LinkedHashMap<String, String>(attributes)
            : new LinkedHashMap<String, String>();

    if (transformsPresent) {
        // Transformations registered by attribute name.
        for (Map.Entry<String, String> rule : getAttributeTransformations().entrySet()) {
            final String targetName = rule.getKey();
            final String valueTemplate = rule.getValue();
            if (valueTemplate == null) {
                result.remove(targetName);
            } else {
                result.put(targetName, evaluateTemplate(valueTemplate, attributes));
            }
        }

        // Transformations registered by pattern, tested against each source attribute.
        for (AttributeTransformation patternRule : this.attributePatternTransformations) {
            for (Map.Entry<String, String> source : attributes.entrySet()) {
                final String sourceName = source.getKey();
                if (!patternRule.satisfy(sourceName, source.getValue())) {
                    continue;
                }
                final String valueTemplate = patternRule.getTemplate();
                if (valueTemplate == null) {
                    result.remove(sourceName);
                } else {
                    result.put(sourceName, evaluateTemplate(valueTemplate, attributes));
                }
            }
        }
    }
    return result;
}
/**
 * Evaluates string template for specified map of variables. Template string can contain
 * dynamic parts in the form of ${VARNAME}. Each such part is replaced with value of the
 * variable if such exists in the map, or with empty string otherwise. Variable names are
 * looked up in lower case.
 *
 * <p>A variable start marker that is never closed is not treated as a variable: it and the
 * rest of the template are copied to the result literally.</p>
 *
 * @param template Template string
 * @param variables Map of variables (can be null)
 * @return Evaluated string, or {@code null} if the template is {@code null}
 */
public String evaluateTemplate(String template, Map<String, String> variables) {
    if (template == null) {
        return null;
    }

    final StringBuilder result = new StringBuilder();
    int copiedUpTo = 0; // index of the first template character not yet written to result
    int startIndex = template.indexOf(VAR_START);

    while (startIndex >= 0) {
        final int nameStart = startIndex + VAR_START.length();
        final int endIndex = template.indexOf(VAR_END, nameStart);
        if (endIndex < 0) {
            // Unterminated variable: no later variable can be terminated either.
            break;
        }

        // Literal text between the previous variable and this one.
        result.append(template, copiedUpTo, startIndex);

        final String varName = template.substring(nameStart, endIndex);
        final Object value = variables != null ? variables.get(varName.toLowerCase()) : null;
        if (value != null) {
            result.append(value.toString());
        }

        copiedUpTo = endIndex + VAR_END.length();
        startIndex = template.indexOf(VAR_START, copiedUpTo);
    }

    // Whatever follows the last evaluated variable is copied verbatim.
    result.append(template, copiedUpTo, template.length());
    return result.toString();
}
}
@@ -0,0 +1,123 @@
/* Copyright (c) 2006-2019, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import java.io.StringWriter;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
/**
 * Converts a cleaned {@link TagNode} tree into a W3C DOM {@link Document} by walking the
 * tree with {@link XmlTraversor} instead of recursing, so that very large or deeply nested
 * HTML documents do not overflow the stack.
 */
public class TraversalDomSerializer {

    private CleanerProperties props;

    /**
     * Whether XML entities should be escaped or not.
     */
    protected boolean escapeXml = true;

    /** Whether entities inside CDATA sections are deserialized. */
    protected boolean deserializeCdataEntities = false;

    /** Whether strict error checking stays enabled on the created Document. */
    protected boolean strictErrorChecking = true;

    /**
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     * @param escapeXml if true then escape XML entities
     * @param deserializeCdataEntities if true then deserialize entities in CData sections
     * @param strictErrorChecking if false then Document strict error checking is turned off
     */
    public TraversalDomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking){
        this.props = props;
        this.escapeXml = escapeXml;
        this.deserializeCdataEntities = deserializeCdataEntities;
        this.strictErrorChecking = strictErrorChecking;
    }

    /**
     * Same as the four-argument constructor with strict error checking enabled.
     *
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     * @param escapeXml if true then escape XML entities
     * @param deserializeCdataEntities if true then deserialize entities in CData sections
     */
    public TraversalDomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities) {
        this(props, escapeXml, deserializeCdataEntities, true);
    }

    /**
     * Same as the three-argument constructor without CDATA entity deserialization.
     *
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     * @param escapeXml if true then escape XML entities
     */
    public TraversalDomSerializer(CleanerProperties props, boolean escapeXml) {
        this(props, escapeXml, false);
    }

    /**
     * Same as the two-argument constructor with XML escaping enabled.
     *
     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
     */
    public TraversalDomSerializer(CleanerProperties props) {
        this(props, true);
    }

    /**
     * Builds a DOM document by traversing the given node tree with a {@link DomBuilder}.
     *
     * @param rootNode the HTML Cleaner root node to serialize
     * @return the W3C Document object
     * @throws ParserConfigurationException if there's an error during serialization
     */
    public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
        final DomBuilder domBuilder = new DomBuilder(props, escapeXml, deserializeCdataEntities, strictErrorChecking);
        XmlTraversor.traverse(domBuilder, rootNode);
        return domBuilder.getDocument();
    }

    /**
     * Serializes a DOM document to text using a default (identity) JAXP transformer.
     *
     * @param doc the document to write out
     * @return the serialized document
     * @throws TransformerException if the transformer cannot be created or the transformation fails
     * @throws ParserConfigurationException declared for callers; not thrown by this implementation
     */
    public static String toString(Document doc) throws TransformerException, ParserConfigurationException{
        final StringWriter out = new StringWriter();
        final Transformer identity = TransformerFactory.newInstance().newTransformer();
        identity.transform(new DOMSource(doc), new StreamResult(out));
        return out.toString();
    }
}
+907
View File
@@ -0,0 +1,907 @@
/* Copyright (c) 2006-2019, the HtmlCleaner project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import java.io.*;
import java.net.URL;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* <p>Common utilities.</p>
*
* Created by: Vladimir Nikic<br/>
* Date: November, 2006.
*/
public class Utils {
/**
 * Regular expression matching one XML NameStartChar at the beginning of the input
 * (see http://www.w3.org/TR/xml/#NT-Name). The Latin-1 letters are the RANGE
 * U+00C0 to U+00D6; earlier revisions omitted the hyphen and therefore only accepted
 * the two end points of that range.
 */
static final String VALID_XML_IDENTIFIER_START_CHAR_REGEX = "^[:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02ff\\u0370-\\u037d"
+ "\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f\\u2c00-\\u2fef\\u3001-\\ud7ff"
+ "\\uf900-\\ufdcf\\ufdf0-\\ufffd\\x{10000}-\\x{EFFFF}]";
/** Compiled form of {@link #VALID_XML_IDENTIFIER_START_CHAR_REGEX}; tests only the first character of its input. */
static final Pattern VALID_XML_IDENTIFIER_START_CHAR_PATTERN =
compileUnicodePattern(VALID_XML_IDENTIFIER_START_CHAR_REGEX);
/*
The relevant production from the spec is http://www.w3.org/TR/xml/#NT-Name
Name ::= NameStartChar NameChar *
NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
*/
// Whole-identifier expression: one NameStartChar class followed by any number of
// NameChar characters, anchored at both ends.
// NOTE(review): the second (NameChar) class ends its CJK range at U+DFFF rather than U+D7FF
// and has no U+10000-U+EFFFF range, unlike the first class and the production quoted
// above - confirm whether that difference is intentional.
static final String VALID_XML_IDENTIFIER_CHAR_REGEX =
"^[:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02ff\\u0370-\\u037d"
+ "\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f\\u2c00-\\u2fef\\u3001-\\ud7ff"
+ "\\uf900-\\ufdcf\\ufdf0-\\ufffd\\x{10000}-\\x{EFFFF}]"
+ "[:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6"
+ "\\u00F8-\\u02ff\\u0370-\\u037d\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f"
+ "\\u2c00-\\u2fef\\u3001-\\udfff\\uf900-\\ufdcf\\ufdf0-\\ufffd\\-\\.0-9"
+ "\\u00b7\\u0300-\\u036f\\u203f-\\u2040]*\\Z";
/** Compiled form of {@link #VALID_XML_IDENTIFIER_CHAR_REGEX}; matches only complete identifiers. */
static final Pattern VALID_XML_IDENTIFIER_CHAR_PATTERN =
compileUnicodePattern(VALID_XML_IDENTIFIER_CHAR_REGEX);
/**
 * Strips one newline from the beginning and one from the end of a string, where present.
 * The leading newline is handled by {@link #lchomp(String)} and the trailing one by
 * {@link #chomp(String)}, in that order.
 *
 * @param str the text to strip
 * @return the text without its first and last newline
 */
static String bchomp(final String str){
    final String withoutLeadingNewline = lchomp(str);
    return chomp(withoutLeadingNewline);
}
/**
 * Removes the last newline (if present) of a string. A trailing "\r\n" pair counts as a
 * single newline; a lone trailing '\n' or '\r' is removed as well. At most one newline
 * is removed.
 *
 * @param str the text to strip; may be null
 * @return the text without its final newline, or null if {@code str} is null
 */
static String chomp(final String str){
    // Null is passed through, mirroring lchomp, so that bchomp(null) does not throw.
    if (str == null || str.isEmpty()) {
        return str;
    }
    final int len = str.length();
    final char last = str.charAt(len - 1);
    if (last == '\n') {
        // "\r\n" is one line terminator, not two.
        final boolean crlf = len > 1 && str.charAt(len - 2) == '\r';
        return str.substring(0, len - (crlf ? 2 : 1));
    }
    if (last == '\r') {
        return str.substring(0, len - 1);
    }
    return str;
}
/**
 * Removes the first newline (if present) of a string. A leading '\n', '\r' or "\r\n" is
 * treated as one newline. For compatibility with earlier revisions a leading "\n\r" pair
 * is also removed as a unit. At most one newline is removed.
 *
 * @param str the text to strip; may be null
 * @return the text without its first newline, or null if {@code str} is null
 */
static String lchomp(final String str){
    if (str == null || str.isEmpty()) {
        return str;
    }
    final char first = str.charAt(0);
    if (first != '\r' && first != '\n') {
        return str;
    }
    int firstIndex = 1;
    if (str.length() > 1) {
        final char second = str.charAt(1);
        // A two-character terminator is a CR/LF mix; "\n\n" or "\r\r" are two separate newlines.
        // (Earlier revisions never stripped a leading '\r' once the string had two or more characters.)
        if (second != first && (second == '\r' || second == '\n')) {
            firstIndex = 2;
        }
    }
    return str.substring(firstIndex);
}
/**
 * Reads everything available at the given URL and decodes it with the given charset.
 * The stream opened on the URL is always closed before this method returns.
 *
 * @param url location to read from
 * @param charset name of the charset used to decode the bytes
 * @return the decoded content
 * @throws IOException if the URL cannot be opened or read, or the charset name is not supported
 */
@Deprecated // Removing network I/O will make htmlcleaner better suited to a server environment which needs managed connections
static CharSequence readUrl(URL url, String charset) throws IOException {
    final StringBuilder content = new StringBuilder(1024);
    final InputStream in = url.openStream();
    try {
        final InputStreamReader decoder = new InputStreamReader(in, charset);
        final char[] chunk = new char[1024];
        // A non-positive count (end of stream) ends the copy.
        for (int n = decoder.read(chunk); n > 0; n = decoder.read(chunk)) {
            content.append(chunk, 0, n);
        }
    } finally {
        in.close();
    }
    return content;
}
/**
 * Checks if specified link is full URL, i.e. starts (ignoring case and surrounding
 * whitespace) with {@code http://}, {@code https://} or {@code file://}.
 *
 * @param link the link to test; may be null
 * @return True, if full URL, false otherwise (including for null).
 */
public static boolean isFullUrl(String link) {
    if (link == null) {
        return false;
    }
    // Lower-case with a fixed locale: under e.g. a Turkish default locale "FILE://"
    // would otherwise become "f\u0131le://" and no longer be recognised.
    final String normalized = link.trim().toLowerCase(java.util.Locale.ROOT);
    return normalized.startsWith("http://") || normalized.startsWith("https://") || normalized.startsWith("file://");
}
/**
 * Resolves a link found on a page (for example in an A or IMG tag) against the URL of
 * that page. (Reinstated as per user request in bug 159)
 * <ul>
 * <li>a full URL is returned as is;</li>
 * <li>a link starting with '?' is appended to the page URL as a query string, or merged
 *     with '&amp;' into the query the page URL already has;</li>
 * <li>a link starting with '/' replaces everything after the host part of the page URL;</li>
 * <li>any other link replaces the last path segment of the page URL.</li>
 * </ul>
 * A page URL that is not itself a full URL is prefixed with "http://". A null link
 * results in a NullPointerException.
 *
 * @param pageUrl URL of the page containing the link
 * @param link the link to resolve
 * @return the resolved URL
 */
public static String fullUrl(String pageUrl, String link) {
    if (isFullUrl(link)) {
        return link;
    }

    if (link != null && link.startsWith("?")) {
        final int queryStart = pageUrl.indexOf('?');
        if (queryStart < 0) {
            return pageUrl + link;
        }
        final int lastPos = pageUrl.length() - 1;
        if (queryStart == lastPos) {
            // Page URL ends with a bare '?': swap it for the link's own query.
            return pageUrl.substring(0, lastPos) + link;
        }
        return pageUrl + "&" + link.substring(1);
    }

    final boolean rootRelative = link.startsWith("/");
    String base = isFullUrl(pageUrl) ? pageUrl : "http://" + pageUrl;

    // Index 8 lies beyond the "//" of "http://" and "https://", so a slash found at or
    // before it belongs to the scheme separator rather than to the path.
    final int cut = rootRelative ? base.indexOf("/", 8) : base.lastIndexOf("/");
    base = (cut <= 8) ? base + "/" : base.substring(0, cut + 1);

    return base + (rootRelative ? link.substring(1) : link);
}
/**
 * Escapes a string for HTML output. The escaping switches are read from the cleaner
 * properties; the call is made as "HTML output, not DOM creation".
 *
 * @param s String to be escaped
 * @param props Cleaner properties affects escaping behaviour
 * @return the escaped string
 */
public static String escapeHtml(String s, CleanerProperties props) {
    return escapeXml(
            s,
            props.isAdvancedXmlEscape(),
            props.isRecognizeUnicodeChars(),
            props.isTranslateSpecialEntities(),
            false,                                // isDomCreation
            props.isTransResCharsToNCR(),
            props.isTransSpecialEntitiesToNCR(),
            true);                                // isHtmlOutput
}
/**
 * Escapes a string for XML output. The escaping switches are read from the cleaner
 * properties; the call is made as "not HTML output".
 *
 * @param s String to be escaped
 * @param props Cleaner properties affects escaping behaviour
 * @param isDomCreation Tells if escaped content will be part of the DOM
 * @return the escaped string
 */
public static String escapeXml(String s, CleanerProperties props, boolean isDomCreation) {
    return escapeXml(
            s,
            props.isAdvancedXmlEscape(),
            props.isRecognizeUnicodeChars(),
            props.isTranslateSpecialEntities(),
            isDomCreation,
            props.isTransResCharsToNCR(),
            props.isTransSpecialEntitiesToNCR(),
            false);                               // isHtmlOutput
}
/**
 * Escapes a string for XML output; shorthand for the eight-argument overload with
 * {@code isHtmlOutput} set to false.
 *
 * @param s the string to escape
 * @param advanced whether to use Advanced XML Escaping
 * @param recognizeUnicodeChars whether to recognise and replace Unicode characters
 * @param translateSpecialEntities whether to translate special entities
 * @param isDomCreation whether the escaping is in the context of DomCreation, an internal operation, with special rules.
 * @param transResCharsToNCR forwarded unchanged to the eight-argument overload
 * @param translateSpecialEntitiesToNCR forwarded unchanged to the eight-argument overload
 * @return the escaped string
 * TODO Consider moving to CleanerProperties since a long list of params is misleading.
 */
public static String escapeXml(String s, boolean advanced, boolean recognizeUnicodeChars, boolean translateSpecialEntities,
        boolean isDomCreation, boolean transResCharsToNCR, boolean translateSpecialEntitiesToNCR) {
    final boolean isHtmlOutput = false;
    return escapeXml(s, advanced, recognizeUnicodeChars, translateSpecialEntities,
            isDomCreation, transResCharsToNCR, translateSpecialEntitiesToNCR, isHtmlOutput);
}
/**
 * Escapes a string for XML or HTML output, scanning it one character at a time.
 * Ampersands are examined first: they may start a numeric character reference
 * ("&amp;#..."), a known special entity, or be a bare ampersand that must be escaped.
 * Any other character that has a special entity of its own is written in escaped form;
 * everything else is copied through.
 *
 * change notes:
 * 1) convert ascii characters encoded using &#xx; format to the ascii characters -- may be an attempt to slip in malicious html
 * 2) convert &#xxx; format characters to &quot; style representation if available for the character.
 * 3) convert html special entities to xml &#xxx; when outputing in xml
 * @param s the string to escape; may be null
 * @param advanced whether to use Advanced XML Escaping
 * @param recognizeUnicodeChars whether to recognise and replace Unicode characters
 * @param translateSpecialEntities whether to translate special entities
 * @param isDomCreation whether the escaping is in the context of DomCreation, an internal operation, with special rules.
 * @param transResCharsToNCR when true, escaped characters are written as the decimal numeric character reference of their special entity instead of its escaped name
 * @param translateSpecialEntitiesToNCR passed on to the numeric-reference conversion that runs when {@code advanced} or {@code recognizeUnicodeChars} is set
 * @param isHtmlOutput whether the output is intended to be treated as HTML
 * @return the escaped string, or null if {@code s} is null
 * TODO Consider moving to CleanerProperties since a long list of params is misleading.
 */
public static String escapeXml(String s, boolean advanced, boolean recognizeUnicodeChars, boolean translateSpecialEntities,
boolean isDomCreation, boolean transResCharsToNCR, boolean translateSpecialEntitiesToNCR, boolean isHtmlOutput) {
if (s != null) {
int len = s.length();
StringBuilder result = new StringBuilder(len);
for (int i = 0; i < len; i++) {
char ch = s.charAt(i);
SpecialEntity code;
if (ch == '&') {
// "&#": numeric character reference; the helper returns the index of the last character it consumed
if ( (advanced || recognizeUnicodeChars) && (i < len-1) && (s.charAt(i+1) == '#') ) {
i = convertToUnicode(s, isDomCreation, recognizeUnicodeChars, translateSpecialEntitiesToNCR, result, i+2);
} else if ((translateSpecialEntities || advanced) &&
// look the entity up in a window of at most 10 characters starting at the ampersand
(code = SpecialEntities.INSTANCE.getSpecialEntity(s.substring(i, i+Math.min(10, len-i)))) != null) {
if (translateSpecialEntities && code.isHtmlSpecialEntity()) {
if (recognizeUnicodeChars) {
result.append( (char)code.intValue() );
} else {
result.append( code.getDecimalNCR() );
}
// skip the entity name and its ';' (the loop increment skips the '&')
i += code.getKey().length() + 1;
} else if (advanced ) {
//
// If we are creating a HTML DOM or outputting to the HtmlSerializer, use HTML special entities;
// otherwise we get their XML escaped version (see bug #118).
//
result.append(transResCharsToNCR ? code.getDecimalNCR() : code.getEscaped(isHtmlOutput || isDomCreation));
i += code.getKey().length()+1;
} else {
// known entity, but neither translation applies: escape just the ampersand
result.append(transResCharsToNCR ? getAmpNcr() : "&amp;");
}
}
//
// If the serializer used to output is HTML rather than XML, and we have a match to a
// known HTML entity such as &nbsp;, we output it as-is (see bug #118)
//
else if (isHtmlOutput)
{
// we have an ampersand and that's all we know so far
code = SpecialEntities.INSTANCE.getSpecialEntity(s.substring(i, i+Math.min(10, len-i)));
if ( code != null )
{
// It is a special entity like &nbsp; - leave it in place.
result.append(code.getEscapedValue());
// advance i by the length of the entity so we won't process each following character
// key length excludes & and ; and we add 1 to skip the ;
i += code.getKey().length()+1;
}
else if ( (i < len-1) && (s.charAt(i+1) == '#') )
{
// if the next char is a # then convert entity number to entity name (if possible)
i = convert_To_Entity_Name(s, false, false, false, result, i+2);
// assuming 'i' is being incremented correctly... not verified.
}
else
{
// html output but not an entity name or number
result.append(transResCharsToNCR ? getAmpNcr() : "&amp;");
}
} else {
// bare ampersand in XML output
result.append(transResCharsToNCR ? getAmpNcr() : "&amp;");
}
} else if ((code = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(ch)) != null ) {
// It's a special entity character itself
if ( isHtmlOutput )
{
if ( "apos".equals(code.getKey()) )
{
// leave the apostrophes alone for html output
// this is a cheap hack to avoid removing apostrophe from the special entities list for html output
result.append(ch);
}
else
{
// output as entity name, or as literal character if isDomCreation
result.append(isDomCreation? code.getHtmlString() : code.getEscapedValue());
}
}
else
{
// if we have one of the XML reserved characters, get escaped version, otherwise,
// output the literal characters.
if (isDomCreation && !isXmlReservedCharacter(String.valueOf(ch))){
result.append(ch);
} else {
// output as entity number, or as literal character if isDomCreation
result.append(transResCharsToNCR ? code.getDecimalNCR() : code.getEscaped(isDomCreation));
}
}
} else {
// ordinary character: copy through
result.append(ch);
}
}
return result.toString();
}
return null;
}
/** Cached decimal numeric character reference for '&amp;'; filled in on first use. */
private static String ampNcr;

/**
 * Returns the decimal numeric character reference of the ampersand, looking it up in
 * {@link SpecialEntities} the first time and reusing the cached value afterwards.
 */
private static String getAmpNcr() {
    if (ampNcr != null) {
        return ampNcr;
    }
    ampNcr = SpecialEntities.INSTANCE.getSpecialEntityByUnicode('&').getDecimalNCR();
    return ampNcr;
}
/** Matches text containing at least one printable ASCII character. */
private static final Pattern ASCII_CHAR = Pattern.compile("\\p{Print}");
/**
 * Converts a Numeric Character Reference (decimal or hex) into a Character Entity
 * Reference where one exists, e.g. &#8364; to &euro;. This is almost a copy of
 * {@link #convertToUnicode}; it is only called for HTML output when "&amp;#" is seen in
 * the input.
 *
 * <p>Differences from earlier revisions:</p>
 * <ul>
 * <li>a code point outside the Unicode range (rejected by {@link Character#toChars(int)})
 *     is now written as escaped text, as {@code convertToUnicode} already did, instead
 *     of escaping as an exception;</li>
 * <li>an upper-case 'X' hex prefix is parsed as hexadecimal rather than failing as decimal;</li>
 * <li>when no number follows "&amp;#", scanning resumes at the '#', so the '#' and the
 *     character after it are no longer dropped from the output.</li>
 * </ul>
 *
 * @param s the text being escaped
 * @param domCreation whether the result is destined for a DOM (literal characters are used)
 * @param recognizeUnicodeChars whether references without a special entity are replaced by the character itself
 * @param translateSpecialEntitiesToNCR whether non-HTML special entities are written as numeric references
 * @param result buffer the converted text is appended to
 * @param i index of the first character after "&amp;#"
 * @return index of the last character consumed; the caller's loop increment moves past it
 */
private static int convert_To_Entity_Name(String s, boolean domCreation, boolean recognizeUnicodeChars, boolean translateSpecialEntitiesToNCR, StringBuilder result, int i) {
    StringBuilder unicode = new StringBuilder();
    int charIndex = extractCharCode(s, i, true, unicode);
    if (unicode.length() == 0) {
        // Not a numeric reference at all: escape the ampersand and let the caller continue
        // with the '#' that follows it (i - 2 is the position of the ampersand).
        result.append("&amp;");
        return i - 2;
    }
    try {
        final char prefix = unicode.charAt(0);
        boolean isHex = prefix == 'x' || prefix == 'X';
        //
        // Get the unicode character and code point
        //
        int codePoint = isHex ? Integer.parseInt(unicode.substring(1), 16) : Integer.parseInt(unicode.toString());
        char[] unicodeChar = Character.toChars(codePoint);
        SpecialEntity specialEntity = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(codePoint);
        if (unicodeChar.length == 1 && unicodeChar[0] == 0) {
            // null character &#0Peanut for example
            // just consume character &
            result.append("&amp;");
        } else if (specialEntity != null) {
            if (specialEntity.isHtmlSpecialEntity()) {
                result.append(domCreation ? specialEntity.getHtmlString() : specialEntity.getEscapedValue());
            } else {
                result.append(domCreation ? specialEntity.getHtmlString() :
                        (translateSpecialEntitiesToNCR ? (isHex ? specialEntity.getHexNCR() : specialEntity.getDecimalNCR()) :
                                specialEntity.getHtmlString()));
            }
        } else if (recognizeUnicodeChars) {
            // output unicode characters as their actual byte code with the exception of characters that have special xml meaning.
            result.append(String.valueOf(unicodeChar));
        } else if (ASCII_CHAR.matcher(new String(unicodeChar)).find()) {
            // ascii printable character. this fancy escaping might be an attempt to slip in dangerous characters (i.e. spelling out <script> )
            // by converting to printable characters we can more easily detect such attacks.
            result.append(String.valueOf(unicodeChar));
        } else {
            // unknown unicode value - output as-is
            result.append("&#").append(unicode).append(";");
        }
    } catch (IllegalArgumentException e) {
        // NumberFormatException (a subclass): the digits do not fit an int or are malformed.
        // IllegalArgumentException: the number is not a legal Unicode code point.
        result.append("&amp;#").append(unicode).append(";");
    }
    return charIndex;
}
/**
 * Converts the numeric character reference that starts at {@code i} (just after
 * "&amp;#") and appends its replacement to {@code result}: the special entity's own
 * representation, the character itself, or the reference as escaped text when it
 * cannot be decoded.
 *
 * <p>Differences from earlier revisions:</p>
 * <ul>
 * <li>an upper-case 'X' hex prefix is parsed as hexadecimal rather than failing as decimal;</li>
 * <li>when no number follows "&amp;#", scanning resumes at the '#', so the '#' and the
 *     character after it are no longer dropped from the output.</li>
 * </ul>
 *
 * @param s the text being escaped
 * @param domCreation whether the result is destined for a DOM (literal characters are used)
 * @param recognizeUnicodeChars whether references are replaced by the character itself
 * @param translateSpecialEntitiesToNCR whether special entities are written as numeric references
 * @param result buffer the converted text is appended to
 * @param i index of the first character after "&amp;#"
 * @return index of the last character consumed; the caller's loop increment moves past it
 */
private static int convertToUnicode(String s, boolean domCreation, boolean recognizeUnicodeChars, boolean translateSpecialEntitiesToNCR, StringBuilder result, int i) {
    StringBuilder unicode = new StringBuilder();
    int charIndex = extractCharCode(s, i, true, unicode);
    if (unicode.length() == 0) {
        // Not a numeric reference at all: escape the ampersand and let the caller continue
        // with the '#' that follows it (i - 2 is the position of the ampersand).
        result.append("&amp;");
        return i - 2;
    }
    try {
        final char prefix = unicode.charAt(0);
        boolean isHex = prefix == 'x' || prefix == 'X';
        //
        // Get the unicode character and code point
        //
        int codePoint = isHex ? Integer.parseInt(unicode.substring(1), 16) : Integer.parseInt(unicode.toString());
        char[] unicodeChar = Character.toChars(codePoint);
        SpecialEntity specialEntity = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(codePoint);
        if (unicodeChar.length == 1 && unicodeChar[0] == 0) {
            // null character &#0Peanut for example
            // just consume character &
            result.append("&amp;");
        } else if (specialEntity != null &&
                // special characters that are always escaped.
                (!specialEntity.isHtmlSpecialEntity()
                // OR we are not outputting unicode characters as the characters ( they are staying escaped )
                || !recognizeUnicodeChars)) {
            result.append(domCreation ? specialEntity.getHtmlString() :
                    (translateSpecialEntitiesToNCR ? (isHex ? specialEntity.getHexNCR() : specialEntity.getDecimalNCR()) :
                            specialEntity.getEscapedXmlString()));
        } else if (recognizeUnicodeChars) {
            // output unicode characters as their actual byte code with the exception of characters that have special xml meaning.
            result.append(String.valueOf(unicodeChar));
        } else if (ASCII_CHAR.matcher(new String(unicodeChar)).find()) {
            // ascii printable character. this fancy escaping might be an attempt to slip in dangerous characters (i.e. spelling out <script> )
            // by converting to printable characters we can more easily detect such attacks.
            result.append(String.valueOf(unicodeChar));
        } else {
            result.append("&#").append(unicode).append(";");
        }
    } catch (IllegalArgumentException e) {
        // NumberFormatException (a subclass): the digits do not fit an int or are malformed.
        // IllegalArgumentException: code point is not a legal unicode character.
        result.append("&amp;#").append(unicode).append(";");
    }
    return charIndex;
}
// TODO have pattern consume leading 0's and discard.
// Patterns for the body of a numeric character reference (the text after "&#"), each
// anchored at the start of its input. Group 1 is the number, including the hex prefix
// where there is one; group 2 is the optional terminating ';'.
// NOTE(review): "[x|X]" is a character class, so it accepts a literal '|' as well as
// 'x' and 'X' - "[xX]" was probably intended; confirm before changing.
/** Hex reference whose prefix comes first. */
public static Pattern HEX_STRICT = Pattern.compile("^([x|X][\\p{XDigit}]+)(;?)");
/** Hex reference that may have zeros before the prefix; the zeros are not part of group 1. */
public static Pattern HEX_RELAXED = Pattern.compile("^0*([x|X][\\p{XDigit}]+)(;?)");
/** Decimal reference. */
public static Pattern DECIMAL = Pattern.compile("^([\\p{Digit}]+)(;?)");
/**
 * Reads the number of a numeric character reference. At most 15 characters starting at
 * {@code charIndex} are inspected, first as a hex reference and then as a decimal one.
 * <ul>
 * <li>(earlier code was failing on this) - &#138A; is converted by FF to 3 characters: &#138; + 'A' + ';'</li>
 * <li>&#0x138A; is converted by FF to 6? 7? characters: &#0 'x'+'1'+'3'+ '8' + 'A' + ';'
 * #0 is displayed kind of weird</li>
 * <li>&#x138A; is a single character</li>
 * </ul>
 *
 * @param s the text being scanned
 * @param charIndex index of the first character after "&amp;#"
 * @param relaxedUnicode '&#0x138;' is treated like '&#x138;'
 * @param unicode receives the number (with its hex prefix, if any); left untouched when nothing matches
 * @return the index of the last matched character, so that normal loop incrementing skips
 *         the ';'; {@code charIndex} unchanged when nothing matches
 */
private static int extractCharCode(String s, int charIndex, boolean relaxedUnicode, StringBuilder unicode) {
    final int windowEnd = Math.min(s.length(), charIndex + 15);
    final CharSequence window = s.subSequence(charIndex, windowEnd);

    final Pattern hexPattern = relaxedUnicode ? HEX_RELAXED : HEX_STRICT;
    Matcher number = hexPattern.matcher(window);
    if (!number.find()) {
        // Not hex: fall back to a decimal reference.
        number = DECIMAL.matcher(window);
        if (!number.find()) {
            return charIndex;
        }
    }
    unicode.append(number.group(1));
    // -1 so normal loop incrementing skips the ';'
    return charIndex + number.end() - 1;
}
/**
 * Sanitizes an identifier using the default "hc-generated-" prefix and an empty
 * replacement for invalid characters.
 *
 * @param attName the attribute name to fix
 * @return the result of the three-argument overload
 */
public static String sanitizeXmlIdentifier(String attName){
    final String defaultPrefix = "hc-generated-";
    return sanitizeXmlIdentifier(attName, defaultPrefix, "");
}
/**
 * Sanitizes an identifier using the given prefix and an empty replacement for invalid
 * characters.
 *
 * @param attName the attribute name to fix
 * @param prefix the prefix to use to indicate an attribute name has been altered
 * @return the result of the three-argument overload
 */
public static String sanitizeXmlIdentifier(String attName, String prefix){
    final String noReplacement = "";
    return sanitizeXmlIdentifier(attName, prefix, noReplacement);
}
// Characters removed from HTML attribute names: NUL, U+0020 SPACE, U+0022 ("), U+0027 ('),
// U+003E (>), U+002F (/) and U+003D (=). Compiled once instead of on every call. The
// expression only lists explicit code points, for which the UNICODE_CHARACTER_CLASS flag
// makes no difference, so a plain compile is sufficient.
private static final Pattern HTML_ATTRIBUTE_NAME_FORBIDDEN_CHARS =
        Pattern.compile("[\\u0000\\u0020\\u0022\\u0027\\u003E\\u002F\\u003d]");

/**
 * Removes from an HTML attribute name the characters that may not appear in one.
 * Attribute names must consist of one or more characters other than controls,
 * U+0020 SPACE, U+0022 ("), U+0027 ('), U+003E (>), U+002F (/), U+003D (=), and noncharacters;
 * of the control characters only NUL is removed here.
 *
 * @param name the attribute name to clean; must not be null
 * @return the name with the forbidden characters removed (possibly empty)
 */
public static String sanitizeHtmlAttributeName(String name){
    return HTML_ATTRIBUTE_NAME_FORBIDDEN_CHARS.matcher(name).replaceAll("");
}
// A valid HTML attribute name: one or more characters, none of which is NUL, U+0020 SPACE,
// U+0022 ("), U+0027 ('), U+003E (>), U+002F (/) or U+003D (=). Compiled once instead of
// on every call. The expression only lists explicit code points, for which the
// UNICODE_CHARACTER_CLASS flag makes no difference, so a plain compile is sufficient.
private static final Pattern HTML_ATTRIBUTE_NAME_PATTERN =
        Pattern.compile("^[^\\u0000\\u0020\\u0022\\u0027\\u003E\\u002F\\u003d]+$");

/**
 * Checks whether a string can be used as an HTML attribute name, i.e. it is non-empty
 * and contains none of the characters listed above.
 *
 * @param name the attribute name to check; must not be null
 * @return true if the name is acceptable
 */
public static boolean isValidHtmlAttributeName(String name){
    return HTML_ATTRIBUTE_NAME_PATTERN.matcher(name).find();
}
/**
 * Attempts to replace invalid attribute names with valid ones. The following are tried
 * in order, and the first valid result wins:
 * <ol>
 * <li>the name itself;</li>
 * <li>if its first character is not a valid start character: the name with the prefix
 *     prepended, or - when no prefix is given - the name without its first character;</li>
 * <li>the name with every invalid character replaced by {@code replacementCharacter}.</li>
 * </ol>
 *
 * @param attName the attribute name to fix
 * @param prefix the prefix to use to indicate an attribute name has been altered; null is treated as empty
 * @param replacementCharacter text substituted for each invalid character; null or empty strips them.
 *        (Earlier revisions ignored this argument and always stripped.)
 * @return either the original attribute name if valid, or a generated identifier if not;
 *         null if no valid identifier could be produced, including for a null or empty name
 */
public static String sanitizeXmlIdentifier(String attName, String prefix, String replacementCharacter){
    // Nothing to work with: previously this case failed in substring(0,1).
    if (attName == null || attName.isEmpty()) return null;

    if (Utils.isValidXmlIdentifier(attName)) return attName;

    //
    // Prepend with "hc-generated-" or similar prefix. Useful for
    // identifiers that are valid apart from the start character, e.g "1a"
    //
    // The first character is taken as a whole code point so that a surrogate pair is not split.
    final int firstCharLength = attName.offsetByCodePoints(0, 1);
    if (!Utils.isValidXmlIdentifierStartChar(attName.substring(0, firstCharLength))){
        if (prefix != null && !prefix.isEmpty()){
            String generatedAttName = prefix + attName;
            if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;
        } else {
            //
            // If not, strip out first character
            //
            String generatedAttName = attName.substring(firstCharLength);
            if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;
        }
    }

    //
    // otherwise, replace or strip out invalid characters
    //
    final String replacement = replacementCharacter == null ? "" : replacementCharacter;
    String generatedAttName = Utils.replaceInvalidXmlIdentifierCharacters(attName, replacement);
    if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;

    //
    // If we still have something invalid - for example none of the characters in
    // it are valid - then return null
    //
    return null;
}
/**
 * Tells whether a string is usable as an XML tag or attribute name, i.e. whether it
 * matches {@link #VALID_XML_IDENTIFIER_CHAR_PATTERN}.
 *
 * @param s String to be checked
 * @return True if string is valid xml identifier, false otherwise (including for null)
 */
public static boolean isValidXmlIdentifier(String s) {
    return s != null && VALID_XML_IDENTIFIER_CHAR_PATTERN.matcher(s).find();
}
/**
 * Tells whether an object is null or its string form is blank. The string form is
 * first passed through advanced XML escaping, and non-breaking spaces count as
 * whitespace.
 *
 * @param o the object to inspect
 * @return True if specified object is null or its string contains only whitespace characters
 */
public static boolean isEmptyString(Object o) {
    if (o == null) {
        return true;
    }
    final String escaped = escapeXml(o.toString(), true, false, false, false, false, false, false);
    // TODO: doesn't escapeXml handle this?
    final String withoutNbsp = escaped.replace(SpecialEntities.NON_BREAKABLE_SPACE, ' ');
    return withoutNbsp.trim().length() == 0;
}
/**
 * Splits a string into tokens with a {@link StringTokenizer}: every character of
 * {@code delimiters} separates tokens and empty tokens are never produced.
 *
 * @param s the text to split; may be null
 * @param delimiters the delimiter characters
 * @return the tokens in order; an empty array if {@code s} is null
 */
public static String[] tokenize(String s, String delimiters) {
    if (s == null) {
        return new String[] {};
    }
    final StringTokenizer source = new StringTokenizer(s, delimiters);
    final String[] tokens = new String[source.countTokens()];
    for (int slot = 0; slot < tokens.length; slot++) {
        tokens[slot] = source.nextToken();
    }
    return tokens;
}
/**
 * Tells whether the given one-character string is one of the XML reserved characters
 * {@code ' " < > &}.
 *
 * <p>Only strings of exactly one character can be reserved. Earlier revisions used a
 * substring test, which also reported the empty string and multi-character runs such as
 * {@code "<>"} as reserved, and threw on null.</p>
 *
 * @param c the character to test, as a string; may be null
 * @return true if {@code c} is a single XML reserved character
 */
public static boolean isXmlReservedCharacter(String c){
    final String XML_CHARS="'\"<>&";
    return c != null && c.length() == 1 && XML_CHARS.indexOf(c.charAt(0)) >= 0;
}
/**
 * Extracts the namespace prefix of an XML element or attribute name.
 *
 * @param name the qualified name
 * @return the part before the first ':', or null if there is no ':' or the name starts with one
 */
public static String getXmlNSPrefix(String name) {
    final int colon = name.indexOf(':');
    return colon > 0 ? name.substring(0, colon) : null;
}
/**
 * Extracts the local part of an XML element or attribute name.
 *
 * @param name the qualified name
 * @return the part after the first ':' when the name has both a prefix and something
 *         after the ':'; otherwise the name unchanged
 */
public static String getXmlName(String name) {
    final int colon = name.indexOf(':');
    final boolean hasPrefixAndLocalPart = colon > 0 && colon < name.length() - 1;
    return hasPrefixAndLocalPart ? name.substring(colon + 1) : name;
}
/**
 * Tells whether {@link Integer#parseInt(String, int)} accepts the given text in the
 * given radix.
 *
 * @param s the text to check
 * @param radix the radix to parse with
 * @return true if the text parses as an int, false if parsing raises a NumberFormatException
 */
static boolean isValidInt(String s, int radix) {
    boolean parsed;
    try {
        Integer.parseInt(s, radix);
        parsed = true;
    } catch (NumberFormatException notAnInt) {
        parsed = false;
    }
    return parsed;
}
/**
 * Removes leading whitespace, as defined by {@link Character#isWhitespace(char)}.
 *
 * @param s the text to trim; may be null
 * @return the text from its first non-whitespace character on, "" if there is none,
 *         or null if {@code s} is null
 */
public static String ltrim(String s) {
    if (s == null) {
        return null;
    }
    for (int pos = 0; pos < s.length(); pos++) {
        if (!Character.isWhitespace(s.charAt(pos))) {
            return s.substring(pos);
        }
    }
    return "";
}
/**
 * Removes trailing whitespace, as defined by {@link Character#isWhitespace(char)}.
 *
 * @param s the text to trim; may be null
 * @return the text up to and including its last non-whitespace character, "" if there
 *         is none, or null if {@code s} is null
 */
public static String rtrim(String s) {
    if (s == null) {
        return null;
    }
    for (int end = s.length(); end > 0; end--) {
        if (!Character.isWhitespace(s.charAt(end - 1))) {
            return s.substring(0, end);
        }
    }
    return "";
}
/**
 * Tells whether the string form of an object is empty once trimmed.
 *
 * @param object Object whose string representation is checked
 * @return true if the string representation trims to the empty string; false otherwise,
 *         including when the object or its string representation is null
 */
public static boolean isWhitespaceString(Object object) {
    if (object == null) {
        return false;
    }
    final String text = object.toString();
    return text != null && text.trim().length() == 0;
}
/**
 * Replaces entities with actual characters.
 * <ul>
 * <li>Named entities known to {@link SpecialEntities} are always replaced.</li>
 * <li>Numeric references (decimal, or hex with an 'x' or 'X' prefix) are replaced when
 *     they denote a special entity, and otherwise only if {@code recognizeUnicodeChars}
 *     is set.</li>
 * <li>Anything that cannot be decoded is left in the text unchanged. This includes
 *     numeric references outside the Unicode range, which previously made
 *     {@link Character#toChars(int)} throw.</li>
 * </ul>
 *
 * @param str the text to process; must not be null
 * @param recognizeUnicodeChars whether numeric references without a special entity are decoded
 * @return the text with the recognised entities replaced
 */
public static String deserializeEntities(String str, boolean recognizeUnicodeChars) {
    StringBuilder buf = new StringBuilder(str);
    SpecialEntities entities = SpecialEntities.INSTANCE;
    // Position of the '&' of the entity currently being scanned, or -1 when outside one.
    int entityStart = -1;
    boolean numericEntity = false;
    boolean hexEntity = false;
    int maxEntityLength = entities.getMaxEntityLength();

    int i = 0;
    int length = buf.length();
    while (i < length) {
        final char ch = buf.charAt(i);
        if (ch == '&') {
            entityStart = i;
            numericEntity = false;
            hexEntity = false;
            ++i;
        } else if (entityStart != -1) {
            if (ch == ';') {
                int entityValue = -1;
                if (numericEntity) {
                    try {
                        entityValue = Integer.parseInt(
                                buf.substring(
                                        // skip "&#x" or "&#"
                                        entityStart + (hexEntity ? 3 : 2),
                                        i
                                ),
                                hexEntity ? 16 : 10
                        );
                    } catch (NumberFormatException e) {
                        entityValue = -1;
                    }
                    SpecialEntity entity = entities.getSpecialEntityByUnicode(entityValue);
                    if (entity != null)
                        entityValue = entity.intValue();
                    else if (!recognizeUnicodeChars)
                        entityValue = -1;
                } else {
                    SpecialEntity entity = entities.getSpecialEntity(buf.substring(entityStart + 1, i));
                    if (entity != null)
                        entityValue = entity.intValue();
                }
                // Rejects both the -1 "not decodable" marker and numbers beyond U+10FFFF.
                if (Character.isValidCodePoint(entityValue)) {
                    char[] decodedEntity = Character.toChars(entityValue);
                    buf.replace(entityStart, i + 1, new String(decodedEntity));
                    length = buf.length();
                    // continue right after the inserted character(s)
                    i = entityStart + decodedEntity.length;
                } else {
                    ++i;
                }
                entityStart = -1;
            } else {
                if (i == entityStart + 1 && ch == '#') {
                    numericEntity = true;
                } else if (i == entityStart + 2 && numericEntity && (ch == 'x' || ch == 'X')) {
                    hexEntity = true;
                } else if (i - entityStart > maxEntityLength) {
                    // too long to be an entity: give up on this '&'
                    entityStart = -1;
                }
                ++i;
            }
        } else {
            ++i;
        }
    }
    return buf.toString();
}
/**
 * Tells whether an identifier begins with a character that may start an XML name,
 * i.e. whether {@link #VALID_XML_IDENTIFIER_START_CHAR_PATTERN} matches it.
 *
 * @param identifier the identifier to check
 * @return true if the initial character is valid
 */
public static boolean isValidXmlIdentifierStartChar(String identifier){
    return VALID_XML_IDENTIFIER_START_CHAR_PATTERN.matcher(identifier).find();
}
/**
 * Matches every character that is not allowed inside an XML name. The ranges follow the
 * NameStartChar / NameChar productions of the XML specification; compiled once because
 * the expression never changes.
 */
private static final Pattern REPLACE_INVALID_XML_IDENTIFIER_CHARS_PATTERN = compileUnicodePattern(""
        + "[^:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6"
        + "\\u00F8-\\u02ff\\u0370-\\u037d\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f"
        + "\\u2c00-\\u2fef\\u3001-\\udfff\\uf900-\\ufdcf\\ufdf0-\\ufffd\\-\\.0-9"
        + "\\u00b7\\u0300-\\u036f\\u203f-\\u2040]");

/**
 * Strips out invalid characters from names used for XML Elements and replaces them with the specified
 * character.
 *
 * For example, "&lt;p%&gt;" becomes "&lt;p_&gt;"
 *
 * @param name the element or attribute name to clean; must not be null
 * @param replacement the text substituted for every invalid character. It is handed to
 *        {@link Matcher#replaceAll(String)}, so {@code $} and {@code \} keep their special
 *        meaning there.
 * @return valid XML name
 */
public static String replaceInvalidXmlIdentifierCharacters(String name, String replacement){
    return REPLACE_INVALID_XML_IDENTIFIER_CHARS_PATTERN.matcher(name).replaceAll(replacement);
}
/**
 * Compiles the given regular expression with {@link Pattern#UNICODE_CHARACTER_CLASS},
 * retrying without any flag if the first attempt is rejected with an
 * {@link IllegalArgumentException}.
 *
 * @param pattern the regular expression to compile
 * @return the compiled pattern
 */
private static Pattern compileUnicodePattern(String pattern){
    Pattern compiled;
    try {
        compiled = Pattern.compile(pattern, Pattern.UNICODE_CHARACTER_CLASS);
    } catch (IllegalArgumentException flagRejected) {
        compiled = Pattern.compile(pattern);
    }
    return compiled;
}
}
+612
View File
@@ -0,0 +1,612 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.StringTokenizer;
/**
 * <p>Utility for searching cleaned document tree with XPath expressions.</p>
 * <p>This is not a real XPath compiler: the expression is split into simple tokens which are
 * interpreted directly against a {@link TagNode}. Only a subset of XPath is understood.</p>
 * Examples of supported axes:
 * <code>
 * <ul>
 *      <li>//div//a</li>
 *      <li>//div//a[@id][@class]</li>
 *      <li>/body/*[1]/@type</li>
 *      <li>//div[3]//a[@id][@href='r/n4']</li>
 *      <li>(//div[last() &gt;= 4]//./div[position() = last()])[position() &gt; 22]//li[2]//a</li>
 *      <li>//div[2]/@*[2]</li>
 *      <li>data(//div//a[@id][@class])</li>
 *      <li>//p/last()</li>
 *      <li>//body//div[3][@class]//span[12.2&lt;position()]/@id</li>
 *      <li>data(//a['v' &lt; @id])</li>
 * </ul>
 * </code>
 */
public class XPather {

    // character codes used by the numeric literal checks
    private static final int C0 = '0';
    private static final int C9 = '9';
    private static final int CD = '.';
    private static final int CP = '+';
    private static final int CM = '-';
    private static final int CS = ' ';

    // array of basic tokens of which XPath expression is made
    private String tokenArray[];

    /**
     * Constructor - creates XPather instance with specified XPath expression.
     * The expression is split on the characters {@code / ( ) [ ] " ' = < >}; the delimiters
     * themselves are kept as tokens.
     *
     * @param expression the XPath expression to tokenize
     */
    public XPather(String expression) {
        StringTokenizer tokenizer = new StringTokenizer(expression, "/()[]\"'=<>", true);
        int tokenCount = tokenizer.countTokens();
        tokenArray = new String[tokenCount];

        int index = 0;

        // this is not real XPath compiler, rather simple way to recognize basic XPaths expressions
        // and interpret them against some TagNode instance.
        while (tokenizer.hasMoreTokens()) {
            tokenArray[index++] = tokenizer.nextToken();
        }
    }

    /**
     * Main public method for this class - a way to execute XPath expression against
     * specified TagNode instance.
     *
     * @param node the node the expression is evaluated against
     * @return the evaluation result as an array
     * @throws XPatherException if the node is null or the expression cannot be evaluated
     */
    public Object[] evaluateAgainstNode(TagNode node) throws XPatherException {
        if (node == null) {
            throw new XPatherException("Cannot evaluate XPath expression against null value!");
        }

        Collection collectionResult = evaluateAgainst(singleton(node), 0, tokenArray.length - 1, false, 1, 0, false, null);
        return collectionResult.toArray();
    }

    private void throwStandardException() throws XPatherException {
        throw new XPatherException();
    }

    /**
     * Evaluates the tokens in the range {@code from..to} (inclusive) against the given
     * collection and returns the resulting collection. When the range is empty or out of
     * bounds the input collection is returned unchanged.
     *
     * @param object the collection the tokens are applied to
     * @param from index of the first token to evaluate
     * @param to index of the last token to evaluate
     * @param isRecursive whether the current step was introduced with {@code //}
     * @param position value reported by {@code position()} inside a filter
     * @param last value reported by {@code last()} inside a filter
     * @param isFilterContext whether evaluation happens inside a {@code [...]} filter
     * @param filterSource collection the right-hand side of a comparison inside a filter is
     *        evaluated against
     * @return the result of the evaluation
     * @throws XPatherException if the expression is malformed
     */
    protected Collection evaluateAgainst(Collection object,
                                         int from,
                                         int to,
                                         boolean isRecursive,
                                         int position,
                                         int last,
                                         boolean isFilterContext,
                                         Collection filterSource) throws XPatherException {
        if (from >= 0 && to < tokenArray.length && from <= to) {
            if ("".equals(tokenArray[from].trim())) {
                // whitespace-only token - nothing to do
                return evaluateAgainst(object, from + 1, to, isRecursive, position, last, isFilterContext, filterSource);
            } else if (isToken("(", from)) {
                int closingBracket = findClosingIndex(from, to);
                if (closingBracket > 0) {
                    Collection value = evaluateAgainst(object, from + 1, closingBracket - 1, false, position, last, isFilterContext, filterSource);
                    return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext, filterSource);
                } else {
                    throwStandardException();
                }
            } else if (isToken("[", from)) {
                int closingBracket = findClosingIndex(from, to);
                if (closingBracket > 0 && object != null) {
                    Collection value = filterByCondition(object, from + 1, closingBracket - 1);
                    return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext, filterSource);
                } else {
                    throwStandardException();
                }
            } else if (isToken("\"", from) || isToken("'", from)) {        // string constant
                int closingQuote = findClosingIndex(from, to);
                if (closingQuote > from) {
                    Collection value = singleton( flatten(from + 1, closingQuote - 1) );
                    return evaluateAgainst(value, closingQuote + 1, to, false, position, last, isFilterContext, filterSource);
                } else {
                    throwStandardException();
                }
            } else if ( (isToken("=", from) || isToken("<", from) || isToken(">", from)) && isFilterContext ) {  // operator inside filter
                boolean logicValue;
                if ( isToken("=", from + 1) && (isToken("<", from) || isToken(">", from)) ) {
                    // two-token operator: "<=" or ">="
                    Collection secondObject = evaluateAgainst(filterSource, from + 2, to, false, position, last, isFilterContext, filterSource);
                    logicValue = evaluateLogic(object, secondObject, tokenArray[from] + tokenArray[from + 1]);
                } else {
                    Collection secondObject = evaluateAgainst(filterSource, from + 1, to, false, position, last, isFilterContext, filterSource);
                    logicValue = evaluateLogic(object, secondObject, tokenArray[from]);
                }
                return singleton(Boolean.valueOf(logicValue));
            } else if (isToken("/", from)) {          // children of the node
                boolean goRecursive = isToken("/", from + 1);
                if (goRecursive) {
                    from++;
                }
                if ( from < to ) {
                    int toIndex = findClosingIndex(from, to) - 1;
                    if (toIndex <= from) {
                        toIndex = to;
                    }
                    Collection value = evaluateAgainst(object, from + 1, toIndex, goRecursive, 1, last, isFilterContext, filterSource);
                    return evaluateAgainst(value, toIndex + 1, to, false, 1, last, isFilterContext, filterSource);
                } else {
                    throwStandardException();
                }
            } else if (isFunctionCall(from, to)) {
                int closingBracketIndex = findClosingIndex(from + 1, to);
                Collection funcValue = evaluateFunction(object, from, to, position, last, isFilterContext);
                return evaluateAgainst(funcValue, closingBracketIndex + 1, to, false, 1, last, isFilterContext, filterSource);
            } else if (isValidInteger(tokenArray[from])) {
                Collection value;
                try {
                    value = singleton(Integer.valueOf(tokenArray[from]));
                } catch (NumberFormatException e) {
                    // syntactically an integer but not representable as int (e.g. too many digits)
                    throw new XPatherException("Invalid integer in XPath expression: " + tokenArray[from], e);
                }
                return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource);
            } else if (isValidDouble(tokenArray[from])) {
                Collection value = singleton(Double.valueOf(tokenArray[from]));
                return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource);
            } else {
                return getElementsByName(object, from, to, isRecursive, isFilterContext);
            }
        } else {
            return object;
        }

        // reached only from branches above that detected a malformed expression
        throw new XPatherException();
    }

    /**
     * Concatenates the tokens in the range {@code from..to} (inclusive).
     *
     * @return the joined tokens, or an empty string if the range is empty
     */
    private String flatten(int from, int to) {
        if (from <= to) {
            StringBuilder result = new StringBuilder();
            for (int i = from; i <= to; i++) {
                result.append(tokenArray[i]);
            }
            return result.toString();
        }

        return "";
    }

    /**
     * Checks if the value is an optionally signed run of decimal digits. At least one digit
     * is required, so a lone sign is not treated as a number.
     */
    private static boolean isValidInteger(String value) {
        final int l = value.length();
        if (l == 0) {
            return false;
        }
        int c = value.charAt(0);
        final boolean firstIsDigit = c >= C0 && c <= C9;
        if (!firstIsDigit && c != CP && c != CM) {
            return false;
        }
        for (int i = 1; i < l; i++) {
            c = value.charAt(i);
            if (c < C0 || c > C9) {
                return false;
            }
        }
        // a sign must be followed by at least one digit
        return firstIsDigit || l > 1;
    }

    /**
     * Checks if the value is a decimal number: a first character that is a sign, a space or
     * a digit, followed only by digits and at most one decimal point. At least one digit is
     * required, which guarantees that {@link Double#valueOf(String)} accepts the value.
     */
    private boolean isValidDouble(String value) {
        final int l = value.length();
        if (l == 0) {
            return false;
        }
        int c = value.charAt(0);
        final boolean firstIsDigit = c >= C0 && c <= C9;
        if (!firstIsDigit && c != CP && c != CM && c != CS) {
            return false;
        }
        int digits = firstIsDigit ? 1 : 0;
        int dots = 0;
        for (int i = 1; i < l; i++) {
            c = value.charAt(i);
            if (c == CD) {
                dots++;
            } else if (c >= C0 && c <= C9) {
                digits++;
            } else {
                return false;
            }
        }
        return digits > 0 && dots <= 1;
    }

    /**
     * Checks if given string is valid identifier: after trimming it must start with a letter
     * and continue with letters, digits, underscores or hyphens.
     *
     * @param s the string to check, may be null
     * @return true if the string is a valid identifier
     */
    private boolean isIdentifier(String s) {
        if (s == null) {
            return false;
        }

        s = s.trim();
        if (s.length() == 0) {
            return false;
        }
        if ( !Character.isLetter(s.charAt(0)) ) {
            return false;
        }
        for (int i = 1; i < s.length(); i++) {
            final char ch = s.charAt(i);
            if ( ch != '_' && ch != '-' && !Character.isLetterOrDigit(ch) ) {
                return false;
            }
        }

        return true;
    }

    /**
     * Checks if tokens in specified range represents valid function call: an identifier
     * immediately followed by an opening bracket that has a matching closing bracket.
     *
     * @param from index of the candidate function name
     * @param to last index of the range
     * @return True if it is valid function call, false otherwise.
     */
    private boolean isFunctionCall(int from, int to) {
        if ( !isIdentifier(tokenArray[from]) || !isToken("(", from + 1) ) {
            return false;
        }

        return findClosingIndex(from + 1, to) > from + 1;
    }

    /**
     * Evaluates specified function.
     * Currently, following XPath functions are supported: last, position, text, count, data
     * <p>The function is evaluated once for every element of the source collection and all
     * results are gathered in the returned collection.</p>
     *
     * @param source collection the function is applied to
     * @param from index of the function name token
     * @param to last index of the range being evaluated
     * @param position position used inside a filter
     * @param last size used inside a filter
     * @param isFilterContext whether evaluation happens inside a filter
     * @return Collection as the result of evaluation.
     * @throws XPatherException if the function is unknown or its argument is malformed
     */
    protected Collection evaluateFunction(Collection source,
                                          int from,
                                          int to,
                                          int position,
                                          int last,
                                          boolean isFilterContext) throws XPatherException {
        String name = tokenArray[from].trim();
        ArrayList result = new ArrayList();

        final int size = source.size();
        Iterator iterator = source.iterator();
        int index = 0;
        while (iterator.hasNext()) {
            Object curr = iterator.next();
            index++;
            if ( "last".equals(name) ) {
                result.add( Integer.valueOf(isFilterContext ? last : size) );
            } else if ( "position".equals(name) ) {
                result.add( Integer.valueOf(isFilterContext ? position : index) );
            } else if ( "text".equals(name) ) {
                if (curr instanceof TagNode) {
                    result.add( ((TagNode)curr).getText() );
                } else if (curr instanceof String) {
                    result.add( curr.toString() );
                }
            } else if ( "count".equals(name) ) {
                // tokens between the brackets form the argument
                Collection argumentEvaluated =
                        evaluateAgainst(source, from + 2, to - 1, false, position, 0, isFilterContext, null);
                result.add( Integer.valueOf(argumentEvaluated.size()) );
            } else if ( "data".equals(name) ) {
                Collection argumentEvaluated = evaluateAgainst(source, from + 2, to - 1, false, position, 0, isFilterContext, null);
                Iterator it = argumentEvaluated.iterator();
                while (it.hasNext()) {
                    Object elem = it.next();
                    if (elem instanceof TagNode) {
                        result.add( ((TagNode)elem).getText() );
                    } else if (elem instanceof String) {
                        result.add( elem.toString() );
                    }
                }
            } else {
                throw new XPatherException("Unknown function " + name + "!");
            }
        }

        return result;
    }

    /**
     * Filter nodes satisfying the condition. The condition (tokens {@code from..to}) is
     * evaluated for each element separately; the first value of the result decides: a Boolean
     * is used as is, an Integer must equal the 1-based position of the element, any other
     * value keeps the element. An empty result drops the element.
     *
     * @param source collection to filter
     * @param from index of the first token of the condition
     * @param to index of the last token of the condition
     * @return the elements that satisfy the condition
     * @throws XPatherException if the condition is malformed
     */
    protected Collection filterByCondition(Collection source, int from, int to) throws XPatherException {
        ArrayList result = new ArrayList();
        Iterator iterator = source.iterator();
        int index = 0;
        int size = source.size();
        while (iterator.hasNext()) {
            Object curr = iterator.next();
            index++;

            ArrayList logicValueList = new ArrayList(evaluateAgainst(singleton(curr), from, to, false, index, size, true, singleton(curr)));
            if (logicValueList.size() >= 1) {
                Object first = logicValueList.get(0);
                if (first instanceof Boolean) {
                    if ( ((Boolean)first).booleanValue() ) {
                        result.add(curr);
                    }
                } else if (first instanceof Integer) {
                    if ( ((Integer)first).intValue() == index ) {
                        result.add(curr);
                    }
                } else {
                    result.add(curr);
                }
            }
        }
        return result;
    }

    /**
     * Checks whether the token at the given index, ignoring surrounding whitespace, equals the
     * given text. Indexes outside the token array simply yield false.
     */
    private boolean isToken(String token, int index) {
        int len = tokenArray.length;
        return index >= 0 && index < len && tokenArray[index].trim().equals(token.trim());
    }

    /**
     * @param from index of the opening token (quote, apostrophe, bracket or slash)
     * @param to last index to search
     * @return matching closing index in the token array for the current token, or -1 if there is
     * no closing token within expected bounds.
     */
    private int findClosingIndex(int from, int to) {
        if (from < to) {
            String currToken = tokenArray[from];

            if ("\"".equals(currToken)) {
                for (int i = from + 1; i <= to; i++) {
                    if ("\"".equals(tokenArray[i])) {
                        return i;
                    }
                }
            } else if ("'".equals(currToken)) {
                for (int i = from + 1; i <= to; i++) {
                    if ("'".equals(tokenArray[i])) {
                        return i;
                    }
                }
            } else if ( "(".equals(currToken) || "[".equals(currToken) || "/".equals(currToken) ) {
                boolean isQuoteClosed = true;
                boolean isAposClosed = true;
                int brackets = "(".equals(currToken) ? 1 : 0;
                int angleBrackets = "[".equals(currToken) ? 1 : 0;
                int slashes = "/".equals(currToken) ? 1 : 0;
                for (int i = from + 1; i <= to; i++) {
                    if ( "\"".equals(tokenArray[i]) ) {
                        isQuoteClosed = !isQuoteClosed;
                    } else if ( "'".equals(tokenArray[i]) ) {
                        isAposClosed = !isAposClosed;
                    } else if ( "(".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
                        brackets++;
                    } else if ( ")".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
                        brackets--;
                    } else if ( "[".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
                        angleBrackets++;
                    } else if ( "]".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
                        angleBrackets--;
                    } else if ( "/".equals(tokenArray[i]) && isQuoteClosed && isAposClosed && brackets == 0 && angleBrackets == 0) {
                        // a slash outside of any bracket ends the step opened by a slash
                        slashes--;
                    }

                    if (isQuoteClosed && isAposClosed && brackets == 0 && angleBrackets == 0 && slashes == 0) {
                        return i;
                    }
                }
            }
        }

        return -1;
    }

    /**
     * Checks if token is attribute (starts with @)
     *
     * @param token the token to check, may be null
     */
    private boolean isAtt(String token) {
        return token != null && token.length() > 1 && token.startsWith("@");
    }

    /**
     * Creates one-element collection for the specified object.
     *
     * @param element the only element of the new collection
     */
    private Collection singleton(Object element) {
        ArrayList result = new ArrayList();
        result.add(element);
        return result;
    }

    /**
     * For the given source collection and specified name, returns collection of subnodes
     * or attribute values. The name is the token at index {@code from}; the remaining tokens
     * up to {@code to} are evaluated against what was found.
     *
     * @param source collection of nodes to search in
     * @param from index of the name token
     * @param to last index of the range being evaluated
     * @param isRecursive whether descendants should be searched as well
     * @param isFilterContext whether evaluation happens inside a filter
     * @return Collection of TagNode instances or collection of String instances.
     * @throws XPatherException if the source contains something that is not a TagNode
     */
    private Collection getElementsByName(Collection source, int from, int to, boolean isRecursive, boolean isFilterContext) throws XPatherException {
        String name = tokenArray[from].trim();

        if (isAtt(name)) {
            name = name.substring(1);
            Collection result = new ArrayList();
            Collection nodes;
            if (isRecursive) {
                nodes = new LinkedHashSet();
                Iterator iterator = source.iterator();
                while (iterator.hasNext()) {
                    Object next = iterator.next();
                    if (next instanceof TagNode) {
                        TagNode node = (TagNode) next;
                        nodes.addAll( node.getAllElementsList(true) );
                    }
                }
            } else {
                nodes = source;
            }

            Iterator iterator = nodes.iterator();
            while (iterator.hasNext()) {
                Object next = iterator.next();
                if (next instanceof TagNode) {
                    TagNode node = (TagNode) next;
                    if ("*".equals(name)) {
                        result.addAll( evaluateAgainst(node.getAttributes().values(), from + 1, to, false, 1, 1, isFilterContext, null) );
                    } else {
                        String attValue = node.getAttributeByName(name);
                        if (attValue != null) {
                            result.addAll( evaluateAgainst(singleton(attValue), from + 1, to, false, 1, 1, isFilterContext, null) );
                        }
                    }
                } else {
                    throwStandardException();
                }
            }
            return result;
        } else {
            Collection result = new LinkedHashSet();
            Iterator iterator = source.iterator();
            int index = 0;
            while (iterator.hasNext()) {
                final Object next = iterator.next();
                if (next instanceof TagNode) {
                    TagNode node = (TagNode) next;
                    index++;
                    boolean isSelf = ".".equals(name);
                    boolean isParent = "..".equals(name);
                    boolean isAll = "*".equals(name);

                    Collection subnodes;
                    if (isSelf) {
                        subnodes = singleton(node);
                    } else if (isParent) {
                        TagNode parent = node.getParent();
                        subnodes = parent != null ? singleton(parent) : new ArrayList();
                    } else {
                        subnodes = isAll ? node.getChildTagList() : node.getElementListByName(name, false);
                    }

                    LinkedHashSet nodeSet = new LinkedHashSet(subnodes);
                    Collection refinedSubnodes = evaluateAgainst(nodeSet, from + 1, to, false, index, nodeSet.size(), isFilterContext, null);

                    if (isRecursive) {
                        List childTags = node.getChildTagList();
                        if (isSelf || isParent || isAll) {
                            result.addAll(refinedSubnodes);
                        }
                        Iterator childIterator = childTags.iterator();
                        while (childIterator.hasNext()) {
                            TagNode childTag = (TagNode) childIterator.next();
                            Collection childrenByName = getElementsByName(singleton(childTag), from, to, isRecursive, isFilterContext);
                            if ( !isSelf && !isParent && !isAll && refinedSubnodes.contains(childTag) ) {
                                result.add(childTag);
                            }
                            result.addAll(childrenByName);
                        }
                    } else {
                        result.addAll(refinedSubnodes);
                    }
                } else {
                    throwStandardException();
                }
            }
            return result;
        }
    }

    /**
     * Evaluates logic operation on two collections. Only the first element of each collection
     * takes part in the comparison: two numbers are compared numerically, anything else is
     * compared as text.
     *
     * @param first left operand
     * @param second right operand
     * @param logicOperator one of {@code =}, {@code <}, {@code >}, {@code <=}, {@code >=}
     * @return Result of logic operation; false if an operand is null or empty or the operator
     *         is not recognized
     */
    protected boolean evaluateLogic(Collection first, Collection second, String logicOperator) {
        if (first == null || first.size() == 0 || second == null || second.size() == 0) {
            return false;
        }

        Object elem1 = first.iterator().next();
        Object elem2 = second.iterator().next();

        if (elem1 instanceof Number && elem2 instanceof Number) {
            double d1 = ((Number)elem1).doubleValue();
            double d2 = ((Number)elem2).doubleValue();
            if ("=".equals(logicOperator)) {
                return d1 == d2;
            } else if ("<".equals(logicOperator)) {
                return d1 < d2;
            } else if (">".equals(logicOperator)) {
                return d1 > d2;
            } else if ("<=".equals(logicOperator)) {
                return d1 <= d2;
            } else if (">=".equals(logicOperator)) {
                return d1 >= d2;
            }
        } else {
            String s1 = toText(elem1);
            String s2 = toText(elem2);
            int result = s1.compareTo(s2);
            if ("=".equals(logicOperator)) {
                return result == 0;
            } else if ("<".equals(logicOperator)) {
                return result < 0;
            } else if (">".equals(logicOperator)) {
                return result > 0;
            } else if ("<=".equals(logicOperator)) {
                return result <= 0;
            } else if (">=".equals(logicOperator)) {
                return result >= 0;
            }
        }

        return false;
    }

    /**
     * Converts an operand to the text used for comparison: the text of a TagNode, the
     * {@code toString()} value of anything else, or an empty string for null.
     */
    private String toText(Object o) {
        if (o == null) {
            return "";
        }
        if (o instanceof TagNode) {
            return ((TagNode)o).getText().toString();
        } else {
            return o.toString();
        }
    }

}
@@ -0,0 +1,62 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
/**
 * <p>Exception that could occur during XPather evaluation.</p>
 */
public class XPatherException extends Exception {

    /** Message used when no specific description of the failure is supplied. */
    private static final String DEFAULT_MESSAGE = "Error in evaluating XPath expression!";

    /**
     * Creates an exception carrying the standard evaluation error message.
     */
    public XPatherException() {
        super(DEFAULT_MESSAGE);
    }

    /**
     * Creates an exception with the given message.
     *
     * @param message description of the failure
     */
    public XPatherException(String message) {
        super(message);
    }

    /**
     * Creates an exception that wraps the given cause.
     *
     * @param cause the underlying problem
     */
    public XPatherException(Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with the given message that wraps the given cause.
     *
     * @param message description of the failure
     * @param cause the underlying problem
     */
    public XPatherException(String message, Throwable cause) {
        super(message, cause);
    }

}
@@ -0,0 +1,313 @@
/* Copyright (c) 2006-2007, Vladimir Nikic
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import java.io.*;
import java.util.*;
/**
 * <p>Abstract XML serializer - contains common logic for descendants.</p>
 */
public abstract class XmlSerializer extends Serializer {

    /** Name (and prefix) of the attributes that declare XML namespaces. */
    public static final String XMLNS_NAMESPACE = "xmlns";

    protected XmlSerializer(CleanerProperties props) {
        super(props);
    }

    // passed on to Utils.escapeXml() whenever content is escaped
    private boolean creatingHtmlDom;

    /**
     * @param creatingHtmlDom the creatingHtmlDom to set
     */
    public void setCreatingHtmlDom(boolean creatingHtmlDom) {
        this.creatingHtmlDom = creatingHtmlDom;
    }

    /**
     * @return the creatingHtmlDom
     */
    public boolean isCreatingHtmlDom() {
        return creatingHtmlDom;
    }

    /**
     * @deprecated Use writeToStream() instead.
     */
    @Deprecated
    public void writeXmlToStream(TagNode tagNode, OutputStream out, String charset) throws IOException {
        super.writeToStream(tagNode, out, charset);
    }

    /**
     * @deprecated Use writeToStream() instead.
     */
    @Deprecated
    public void writeXmlToStream(TagNode tagNode, OutputStream out) throws IOException {
        super.writeToStream(tagNode, out);
    }

    /**
     * @deprecated Use writeToFile() instead.
     */
    @Deprecated
    public void writeXmlToFile(TagNode tagNode, String fileName, String charset) throws IOException {
        super.writeToFile(tagNode, fileName, charset);
    }

    /**
     * @deprecated Use writeToFile() instead.
     */
    @Deprecated
    public void writeXmlToFile(TagNode tagNode, String fileName) throws IOException {
        super.writeToFile(tagNode, fileName);
    }

    /**
     * @deprecated Use getAsString() instead.
     */
    @Deprecated
    public String getXmlAsString(TagNode tagNode, String charset) {
        return super.getAsString(tagNode, charset);
    }

    /**
     * @deprecated Use getAsString() instead.
     */
    @Deprecated
    public String getXmlAsString(TagNode tagNode) {
        return super.getAsString(tagNode);
    }

    /**
     * @deprecated Use write() instead.
     */
    @Deprecated
    public void writeXml(TagNode tagNode, Writer writer, String charset) throws IOException {
        super.write(tagNode, writer, charset);
    }

    /**
     * Escapes the given content through {@link Utils#escapeXml}, using the properties of this
     * serializer and its {@link #isCreatingHtmlDom()} flag.
     *
     * @param xmlContent the content to escape
     * @return the escaped content
     */
    protected String escapeXml(String xmlContent) {
        return Utils.escapeXml(xmlContent, props, isCreatingHtmlDom());
    }

    /**
     * @param tagNode the tag whose content is about to be written
     * @return true if the properties request CDATA for this tag name, in which case its
     *         content is written without escaping
     */
    protected boolean dontEscape(TagNode tagNode) {
        return props.isUseCdataFor(tagNode.getName());
    }

    /**
     * @param tagNode the tag to check
     * @return true if the tag should be written in the minimized form {@code <tag />}: it
     *         must be empty, its tag info (if any) must permit minimized syntax, and either
     *         empty-element tags are enabled in the properties or the tag info marks it as
     *         an empty tag
     */
    protected boolean isMinimizedTagSyntax(TagNode tagNode) {
        final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
        return tagNode.isEmpty() && (tagInfo == null || tagInfo.isMinimizedTagPermitted()) &&
               ( props.isUseEmptyElementTags() || (tagInfo != null && tagInfo.isEmptyTag()) );
    }

    /**
     * Writes the opening tag of the node, followed by a newline where
     * {@link #serializeOpenTag(TagNode, Writer, boolean)} emits one.
     */
    protected void serializeOpenTag(TagNode tagNode, Writer writer) throws IOException {
        serializeOpenTag(tagNode, writer, true);
    }

    /**
     * Serialize a CDATA section. If the context is a script or style tag, and
     * using CDATA for script and style is set to true, then we just write the
     * actual content, as the whole section is wrapped in CDATA tokens.
     * Otherwise we escape the content as if it were regular text.
     *
     * @param item the CDATA instance
     * @param tagNode the TagNode within which the CDATA appears
     * @param writer the writer to output to
     * @throws IOException
     */
    protected void serializeCData(CData item, TagNode tagNode, Writer writer) throws IOException{
        if (dontEscape(tagNode)){
            writer.write(item.getContentWithoutStartAndEndTokens());
        } else {
            writer.write(escapeXml(item.getContentWithStartAndEndTokens()));
        }
    }

    /**
     * Serialize a content token, escaping where necessary.
     * @param item the content token to serialize
     * @param tagNode the TagNode within which the content token appears
     * @param writer the writer to output to
     * @throws IOException
     */
    protected void serializeContentToken(ContentNode item, TagNode tagNode, Writer writer) throws IOException {
        if (dontEscape(tagNode)){
            writer.write(item.getContent());
        } else {
            writer.write( escapeXml(item.getContent()) );
        }
    }

    /**
     * Writes the opening tag of the node including its attributes. Nothing is written for
     * forbidden tags. For tags whose content is not escaped, the CDATA start marker is
     * appended unless the content already begins with it.
     *
     * @param tagNode the node to serialize
     * @param writer the writer to output to
     * @param newLine whether a newline is written after a minimized ({@code <tag />}) tag
     * @throws IOException
     */
    protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
        if ( !isForbiddenTag(tagNode)) {
            String tagName = tagNode.getName();

            //
            // Ensure we use valid XML element names
            //
            tagName = Utils.sanitizeXmlIdentifier(tagName);

            Map<String, String> tagAttributes = tagNode.getAttributes();

            // always have head and body in newline
            if (props.isAddNewlineToHeadAndBody() && isHeadOrBody(tagName)) {
                writer.write("\n");
            }

            writer.write("<" + tagName);
            for (Map.Entry<String, String> entry : tagAttributes.entrySet()) {
                serializeAttribute(tagNode, writer, entry.getKey(), entry.getValue());
            }

            if ( isMinimizedTagSyntax(tagNode) ) {
                writer.write(" />");
                if (newLine) {
                    writer.write("\n");
                }
            } else if (dontEscape(tagNode)) {
                // because we are not considering if the file is xhtml or html,
                // we need to put a javascript comment in front of the CDATA in case this is NOT xhtml
                writer.write(">");
                // the node text is needed several times below, so it is computed only once
                final String text = tagNode.getText().toString();
                if (!text.startsWith(CData.SAFE_BEGIN_CDATA)) {
                    writer.write(CData.SAFE_BEGIN_CDATA);

                    //
                    // Insert a newline after the CDATA start marker if there isn't
                    // already a newline character there
                    //
                    if (!text.equals("")){
                        char firstchar = text.charAt(0);
                        if (firstchar != '\n' && firstchar != '\r') {
                            writer.write("\n");
                        }
                    }
                }
            } else {
                writer.write(">");
            }
        }
    }

    /**
     * @param tagNode
     * @return true if the tag is forbidden
     */
    protected boolean isForbiddenTag(TagNode tagNode) {
        // null tagName when rootNode is a dummy node.
        // this happens when omitting the html envelope elements ( <html>, <head>, <body> elements )
        String tagName = tagNode.getName();
        return tagName == null;
    }

    /**
     * @param tagName the tag name to check, compared ignoring case
     * @return true if the name is "head" or "body"
     */
    protected boolean isHeadOrBody(String tagName) {
        return "head".equalsIgnoreCase(tagName) || "body".equalsIgnoreCase(tagName);
    }

    /**
     * This allows overriding to eliminate forbidden attributes (for example javascript attributes onclick, onblur, etc. )
     * @param tagNode the tag the attribute belongs to
     * @param writer
     * @param attName
     * @param attValue
     * @throws IOException
     */
    protected void serializeAttribute(TagNode tagNode, Writer writer, String attName, String attValue) throws IOException {

        //
        // For XML, we can't use the lax definition of attribute names used in HTML5, so
        // we have to replace any invalid ones with a generated attribute name, or skip
        // them entirely.
        //
        if (!props.isAllowInvalidAttributeNames()){
            attName = Utils.sanitizeXmlIdentifier(attName, props.getInvalidXmlAttributeNamePrefix());
        }

        if (attName != null && (Utils.isValidXmlIdentifier(attName) || props.isAllowInvalidAttributeNames()) && !isForbiddenAttribute(tagNode, attName, attValue)) {
            writer.write(" " + attName + "=\"" + escapeXml(attValue) + "\"");
        }
    }

    /**
     * Override to add additional conditions.
     * @param tagNode
     * @param attName
     * @param value
     * @return true if the attribute should not be outputed.
     */
    protected boolean isForbiddenAttribute(TagNode tagNode, String attName, String value) {
        // namespace declarations are dropped when the serializer is not namespace aware
        return !props.isNamespacesAware() && (XMLNS_NAMESPACE.equals(attName) || attName.startsWith(XMLNS_NAMESPACE + ":"));
    }

    /**
     * Writes the closing tag of the node followed by a newline.
     */
    protected void serializeEndTag(TagNode tagNode, Writer writer) throws IOException {
        serializeEndTag(tagNode, writer, true);
    }

    /**
     * Writes the closing tag of the node. Nothing is written for forbidden tags. For tags
     * whose content is not escaped, the CDATA end marker is written first unless the trimmed
     * content already ends with it.
     *
     * @param tagNode the node to serialize
     * @param writer the writer to output to
     * @param newLine whether a newline is written after the closing tag
     * @throws IOException
     */
    protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
        if ( !isForbiddenTag(tagNode)) {
            String tagName = tagNode.getName();

            //
            // Ensure we use valid XML element names
            //
            tagName = Utils.sanitizeXmlIdentifier(tagName);

            if (dontEscape(tagNode)) {
                // because we are not considering if the file is xhtml or html,
                // we need to put a javascript comment in front of the CDATA in case this is NOT xhtml

                // the node text is needed several times below, so it is computed only once
                final String text = tagNode.getText().toString();
                if (!text.trim().endsWith(CData.SAFE_END_CDATA)) {

                    //
                    // Insert a newline character before the CDATA end marker if there isn't one
                    // already at the end of the tag node content
                    //
                    if (text.length() > 0){
                        char lastchar = text.charAt(text.length() - 1);
                        if (lastchar != '\n' && lastchar != '\r') {
                            writer.write("\n");
                        }
                    }

                    // Write the CDATA end marker
                    writer.write(CData.SAFE_END_CDATA);
                }
            }

            writer.write( "</" + tagName + ">" );

            if (newLine) {
                writer.write("\n");
            }
        }
    }

}
@@ -0,0 +1,48 @@
package org.htmlcleaner;
import java.util.Iterator;
import java.util.List;
/**
* Depth-first node traversor. Use to iterate through all nodes under and including the specified root node.
* <p>
* This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
* </p>
* <p>
* For every node {@link XmlVisitor#head} is called when the node is reached and
* {@link XmlVisitor#tail} when the traversal leaves it.
* </p>
*/
public class XmlTraversor {
// NOTE(review): this field is never read or written in this class - traverse() is static
// and uses its own parameter; confirm whether it can be removed.
private XmlVisitor visitor;
/**
* Start a depth-first traverse of the root and all of its descendants.
* @param visitor Node visitor.
* @param root the root node point to traverse.
*/
public static void traverse(XmlVisitor visitor, HtmlNode root) {
HtmlNode node = root;
// depth of the current node relative to root (root itself is 0)
int depth = 0;
while (node != null) {
visitor.head(node, depth);
if ( node instanceof TagNode && ((TagNode)node).hasChildren() ) {
// descend into the first child
node = (HtmlNode)((TagNode)node).getAllChildren().get(0);
depth++;
} else {
// leaf (or tag without children): close it and move on through the sibling list
List<? extends BaseToken> siblings = node.getSiblings();
Iterator<? extends BaseToken> it = siblings.iterator();
// NOTE(review): the iterator is created once from the siblings of the original node and
// consumed by the it.next() call in the loop condition; how this finds the "next" sibling
// depends on what getSiblings() returns - verify against HtmlNode.getSiblings().
while (it.hasNext() && it.next() == null && depth > 0) {
visitor.tail(node, depth);
node = node.getParent();
depth--;
}
visitor.tail(node, depth);
// back at the starting node: the traversal is complete
if (node == root)
break;
if (it.hasNext()){
node = (HtmlNode)it.next();
} else {
// nothing left to visit - ends the outer loop
node = null;
}
}
}
}
}
@@ -0,0 +1,29 @@
package org.htmlcleaner;
/**
 * Callback contract used by {@link XmlTraversor} while it walks a node tree.
 * <p>
 * Two notifications exist per node: {@code head} is delivered when the node is first reached, and
 * {@code tail} once all of the node's children have been visited. A typical use is to emit a start tag
 * from {@code head} and the matching end tag from {@code tail}.
 * </p>
 */
public interface XmlVisitor {

    /**
     * Invoked when a node is encountered for the first time.
     *
     * @param node  the node being visited.
     * @param depth distance of the node from the traversal root: the root itself is reported with
     *              depth 0, its direct children with depth 1, and so on.
     */
    void head(HtmlNode node, int depth);

    /**
     * Invoked when a node is visited for the last time, i.e. after all of its descendants.
     *
     * @param node  the node being visited.
     * @param depth distance of the node from the traversal root: the root itself is reported with
     *              depth 0, its direct children with depth 1, and so on.
     */
    void tail(HtmlNode node, int depth);
}
@@ -0,0 +1,124 @@
/*
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner.audit;
/**
 * Reason codes the cleaner reports to clients to explain why, and how, it modified the document.
 *
 * @author Konstantin Burov (aectann@gmail.com)
 */
public enum ErrorType {

    /**
     * A tag whose presence is <i>critical</i> for the current tag is missing; most likely the current tag
     * was pruned. In contrast to {@link #RequiredParentMissing}, this is reported when the cleaner removed
     * the tag rather than creating the missing parent. See {@link org.htmlcleaner.TagInfo} for a more
     * detailed description of fatal and required tags.
     * <p>
     * <b>Example:</b>
     * <ul>
     * <li>&lt;option&gt; tag without a parent &lt;select&gt;
     * <li>&lt;tr&gt; tag without a parent &lt;table&gt;
     * <li>...
     * </ul>
     */
    FatalTagMissing,

    /**
     * The tag is not on the list of allowed tags and was therefore removed.
     */
    NotAllowedTag,

    /**
     * A missing parent tag was added for the current tag (e.g. tbody for tr).
     */
    RequiredParentMissing,

    /**
     * No matching close token was found for an open tag, so the tag was closed automatically.
     * <p>
     * <b>Example:</b>
     * <p>
     * &lt;p&gt;Some text..
     * <p>
     * Unclosed &lt;p&gt; tag.
     */
    UnclosedTag,

    /**
     * A second instance of a unique tag was found; most likely it was removed.
     * <p>
     * <b>Example:</b>
     *
     * <pre>
     * &lt;head&gt;
     *   &lt;title&gt;Some text&lt;/title&gt;
     *   &lt;title&gt;Some more text&lt;/title&gt;
     * &lt;/head&gt;
     * </pre>
     */
    UniqueTagDuplicated,

    /**
     * The tag is deprecated and the current cleaner mode does not allow deprecated tags. The tag was removed.
     * <p>
     * <b>Example:</b>
     * <ul>
     * <li>&lt;u&gt;
     * <li>&lt;s&gt;
     * <li>&lt;strike&gt;
     * <li>....
     * </ul>
     */
    Deprecated,

    /**
     * The tag has a child that is not permitted inside it, so the tag was closed automatically to avoid
     * the invalid nesting.
     * <p>
     * <b>Example:</b>
     * <p>
     * &lt;p&gt;Some text &lt;table&gt;...&lt;/table&gt;&lt;p&gt;
     * <p>
     * &lt;table&gt; is not allowed to be a child of &lt;p&gt;, thus &lt;p&gt; is closed before the &lt;table&gt;.
     */
    UnpermittedChild,

    /**
     * The tag is unknown and the current cleaner mode does not allow unknown tags. The tag was removed.
     * <p>
     * <b>Example:</b>
     * <ul>
     * <li>&lt;any&gt;
     * <li>&lt;tag&gt;
     * <li>....
     * </ul>
     */
    Unknown
}
@@ -0,0 +1,85 @@
/*
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner.audit;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.conditional.ITagNodeCondition;
/**
 * Observer for the changes html cleaner applies to a document. Implementations are registered on
 * {@link org.htmlcleaner.CleanerProperties} and are notified about each modification.
 *
 * @author Konstantin Burov (aectann@gmail.com)
 */
public interface HtmlModificationListener {

    /**
     * Called when the cleaner repairs an error in the html syntax.
     *
     * @param certain   true if the change does not harm the resulting document.
     * @param tagNode   the problematic node.
     * @param errorType the kind of problem that was fixed.
     */
    void fireHtmlError(boolean certain, TagNode tagNode, ErrorType errorType);

    /**
     * Called when the cleaner tidies up "ugly" html: markup that is syntactically correct but achieves
     * its goal in a clumsy way, for example when deprecated tags are removed.
     *
     * @param certainty true if the change does not harm the resulting document.
     * @param tagNode   the problematic node.
     * @param errorType the kind of problem that was fixed.
     */
    void fireUglyHtml(boolean certainty, TagNode tagNode, ErrorType errorType);

    /**
     * Called when the cleaner modifies html because an {@link ITagNodeCondition} matched.
     *
     * @param condition the condition that was applied to make the modification.
     * @param tagNode   the problematic node.
     */
    void fireConditionModification(ITagNodeCondition condition, TagNode tagNode);

    /**
     * Called when the cleaner modifies html because of user specified rules.
     *
     * @param certainty true if the change does not harm the resulting document.
     * @param tagNode   the problematic node.
     * @param errorType the kind of problem that was fixed.
     */
    void fireUserDefinedModification(boolean certainty, TagNode tagNode, ErrorType errorType);
}
@@ -0,0 +1,32 @@
package org.htmlcleaner.audit;
import java.util.logging.Logger;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.conditional.ITagNodeCondition;
/**
 * {@link HtmlModificationListener} that reports every notification to a {@link Logger} at INFO level.
 * <p>
 * Each message is prefixed with the name of the callback that produced it, followed by the details of
 * the notification, so the kind of modification can be told apart in the log output.
 * </p>
 */
public class HtmlModificationListenerLogger implements HtmlModificationListener {

    /** Destination of all messages; supplied by the creator of this listener. */
    private final Logger log;

    /**
     * @param log logger that receives one INFO record per notification.
     */
    public HtmlModificationListenerLogger(Logger log) {
        this.log = log;
    }

    /**
     * Logs a modification triggered by a matching condition.
     *
     * @param condition the condition that matched.
     * @param tagNode   the affected node.
     */
    public void fireConditionModification(ITagNodeCondition condition, TagNode tagNode) {
        this.log.info("fireConditionModification:"+condition+" at "+tagNode);
    }

    /**
     * Logs a repaired html syntax error.
     *
     * @param safety    flag passed through from the cleaner (the listener interface calls it "certain").
     * @param tagNode   the affected node.
     * @param errorType the kind of problem that was fixed.
     */
    public void fireHtmlError(boolean safety, TagNode tagNode, ErrorType errorType) {
        this.log.info("fireHtmlError:"+errorType+"("+safety+") at "+tagNode);
    }

    /**
     * Logs a clean-up of ugly (but syntactically valid) html.
     *
     * @param safety    flag passed through from the cleaner (the listener interface calls it "certainty").
     * @param tagNode   the affected node.
     * @param errorType the kind of problem that was fixed.
     */
    public void fireUglyHtml(boolean safety, TagNode tagNode, ErrorType errorType) {
        // Prefix names this callback; it previously carried the copy-pasted "fireConditionModification:".
        this.log.info("fireUglyHtml:"+errorType+"("+safety+") at "+tagNode);
    }

    /**
     * Logs a modification caused by user defined rules.
     *
     * @param safety    flag passed through from the cleaner (the listener interface calls it "certainty").
     * @param tagNode   the affected node.
     * @param errorType the kind of problem that was fixed.
     */
    public void fireUserDefinedModification(boolean safety, TagNode tagNode, ErrorType errorType) {
        // Prefix names this callback and includes the ':' separator used by the other messages.
        this.log.info("fireUserDefinedModification:"+errorType+"("+safety+") at "+tagNode);
    }
}
@@ -0,0 +1,10 @@
package org.htmlcleaner.conditional;
import org.htmlcleaner.TagNode;
/**
 * Common contract for the different node checkers: a predicate over {@link TagNode}s.
 */
public interface ITagNodeCondition {

    /**
     * Evaluates this condition against a node.
     *
     * @param tagNode the node to check.
     * @return true if the node fulfils the condition.
     */
    boolean satisfy(TagNode tagNode);
}
@@ -0,0 +1,12 @@
package org.htmlcleaner.conditional;
import org.htmlcleaner.TagNode;
/**
 * Condition that matches all nodes.
 */
public class TagAllCondition implements ITagNodeCondition {

    /**
     * Accepts every argument, regardless of its value.
     *
     * @param tagNode ignored.
     * @return always true.
     */
    public boolean satisfy(TagNode tagNode) {
        return true;
    }
}
@@ -0,0 +1,18 @@
package org.htmlcleaner.conditional;
import org.htmlcleaner.TagNode;
/**
 * Condition that matches nodes carrying a given attribute.
 */
public class TagNodeAttExistsCondition implements ITagNodeCondition {

    /** Name of the attribute to look for; lower-cased at lookup time. */
    private String attName;

    /**
     * @param attName name of the attribute whose presence is checked.
     */
    public TagNodeAttExistsCondition(String attName) {
        this.attName = attName;
    }

    /**
     * @param tagNode the node to inspect; may be null.
     * @return true if the node is non-null and its attribute map has a key equal to the lower-cased
     *         attribute name; false for a null node.
     */
    public boolean satisfy(TagNode tagNode) {
        if (tagNode == null) {
            return false;
        }
        return tagNode.getAttributes().containsKey(attName.toLowerCase());
    }
}
@@ -0,0 +1,30 @@
package org.htmlcleaner.conditional;
import java.util.Map;
import java.util.regex.Pattern;
import org.htmlcleaner.TagNode;
/**
 * Condition that matches nodes having at least one attribute whose name and value are both found by
 * the configured regular expressions. A null pattern acts as a wildcard for its part.
 */
public class TagNodeAttNameValueRegexCondition implements ITagNodeCondition {

    /** Pattern searched for in the attribute name; null accepts any name. */
    private Pattern attNameRegex;

    /** Pattern searched for in the attribute value; null accepts any value. */
    private Pattern attValueRegex;

    /**
     * @param attNameRegex  pattern for attribute names, or null to accept every name.
     * @param attValueRegex pattern for attribute values, or null to accept every value.
     */
    public TagNodeAttNameValueRegexCondition(Pattern attNameRegex, Pattern attValueRegex) {
        this.attNameRegex = attNameRegex;
        this.attValueRegex = attValueRegex;
    }

    /**
     * @param tagNode the node to inspect; may be null.
     * @return true as soon as one attribute passes both the name and the value check (using
     *         {@code find()}, i.e. a partial match suffices); false for a null node or when no
     *         attribute qualifies.
     */
    public boolean satisfy(TagNode tagNode) {
        if (tagNode == null) {
            return false;
        }
        for (Map.Entry<String, String> attribute : tagNode.getAttributes().entrySet()) {
            boolean nameAccepted = attNameRegex == null
                    || attNameRegex.matcher(attribute.getKey()).find();
            if (!nameAccepted) {
                continue;
            }
            // The value is only examined once the name has been accepted.
            boolean valueAccepted = attValueRegex == null
                    || attValueRegex.matcher(attribute.getValue()).find();
            if (valueAccepted) {
                return true;
            }
        }
        return false;
    }
}
@@ -0,0 +1,28 @@
package org.htmlcleaner.conditional;
import org.htmlcleaner.TagNode;
/**
 * Condition that matches nodes whose given attribute equals a given value.
 */
public class TagNodeAttValueCondition implements ITagNodeCondition {

    /** Name of the attribute to read from the node. */
    private String attName;

    /** Value the attribute is compared with. */
    private String attValue;

    /** Whether the value comparison distinguishes upper and lower case. */
    private boolean isCaseSensitive;

    /**
     * @param attName         name of the attribute to check.
     * @param attValue        expected attribute value.
     * @param isCaseSensitive true to compare with {@code equals}, false to compare with
     *                        {@code equalsIgnoreCase}.
     */
    public TagNodeAttValueCondition(String attName, String attValue, boolean isCaseSensitive) {
        this.attName = attName;
        this.attValue = attValue;
        this.isCaseSensitive = isCaseSensitive;
    }

    /**
     * @param tagNode the node to inspect; may be null.
     * @return false if the node, the attribute name or the expected value is null; otherwise whether
     *         the node's attribute value equals the expected value under the configured case rule.
     */
    public boolean satisfy(TagNode tagNode) {
        boolean incomplete = tagNode == null || attName == null || attValue == null;
        if (incomplete) {
            return false;
        }
        String actualValue = tagNode.getAttributeByName(attName);
        if (isCaseSensitive) {
            return attValue.equals(actualValue);
        }
        return attValue.equalsIgnoreCase(actualValue);
    }
}
@@ -0,0 +1,25 @@
package org.htmlcleaner.conditional;
import org.htmlcleaner.TagNode;
/**
 * Condition used to remove empty autogenerated nodes. These nodes are created when an unclosed tag is
 * immediately closed.
 *
 * @author patmoore
 */
public class TagNodeAutoGeneratedCondition implements ITagNodeCondition {

    /** Shared instance; the condition holds no state. */
    public static final TagNodeAutoGeneratedCondition INSTANCE = new TagNodeAutoGeneratedCondition();

    /**
     * Matches nodes that are flagged as auto-generated and are empty, i.e. generated nodes that are
     * not needed.
     *
     * @see org.htmlcleaner.conditional.ITagNodeCondition#satisfy(org.htmlcleaner.TagNode)
     */
    public boolean satisfy(TagNode tagNode) {
        if (!tagNode.isAutoGenerated()) {
            return false;
        }
        return tagNode.isEmpty();
    }

    @Override
    public String toString() {
        return "auto generated tagNode";
    }
}
@@ -0,0 +1,94 @@
package org.htmlcleaner.conditional;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.htmlcleaner.ContentNode;
import org.htmlcleaner.ITagInfoProvider;
import org.htmlcleaner.TagInfo;
import org.htmlcleaner.TagNode;
import static org.htmlcleaner.Utils.isEmptyString;
import static org.htmlcleaner.Display.*;
/**
 * Checks if node is an <b>inline</b> or block element and has empty contents or white/non-breakable spaces
 * only. Nodes that have a non-empty id attribute are considered to be non-empty, since they can be used in
 * javascript scenarios.
 *
 * Examples that should be pruned:
 * <pre>
 * &lt;u&gt; &lt;/u&gt;
 * &lt;table&gt;&lt;tr&gt;&lt;td&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
 * </pre>
 *
 * Examples of code that should NOT be pruned:
 * <pre>
 * &lt;p&gt;&lt;img/&gt;&lt;/p&gt; - no content but image tags do not have text content.
 * &lt;table&gt;&lt;tr&gt;&lt;td/&gt;&lt;td&gt;hi&lt;/td&gt;&lt;/tr&gt; - the first (empty) td is a placeholder so the second td is in the correct column
 * </pre>
 * @author Konstantin Burov
 */
public class TagNodeEmptyContentCondition implements ITagNodeCondition {

    private static final String ID_ATTRIBUTE_NAME = "id";

    /**
     * Elements whose removal would disturb the layout too much to be pruned on their own.
     */
    private static final Set<String> unsafeBlockElements = new HashSet<String>();
    static {
        // a td/th cannot simply be removed unless the entire row goes: cells are place holders
        unsafeBlockElements.add("td");
        unsafeBlockElements.add("th");
    }

    /** Source of the {@link TagInfo} describing each tag name. */
    private ITagInfoProvider tagInfoProvider;

    /**
     * @param provider used to look up the {@link TagInfo} of the inspected nodes.
     */
    public TagNodeEmptyContentCondition(ITagInfoProvider provider) {
        this.tagInfoProvider = provider;
    }

    /**
     * @param tagNode the node to inspect.
     * @return true if the node counts as empty and may be pruned; td/th nodes are never accepted here.
     */
    public boolean satisfy(TagNode tagNode) {
        return satisfy(tagNode, false);
    }

    /**
     * Core check.
     *
     * @param tagNode  the node to inspect.
     * @param override true to also accept the elements listed in {@link #unsafeBlockElements}; used
     *                 when examining the descendants of a candidate node.
     * @return true if the node and everything below it is empty.
     */
    private boolean satisfy(TagNode tagNode, boolean override) {
        String name = tagNode.getName();
        TagInfo tagInfo = tagInfoProvider.getTagInfo(name);

        // Unknown tags and tags carrying an id are never considered empty.
        if (tagInfo == null || hasIdAttributeSet(tagNode)) {
            return false;
        }
        // Tags with display "none" and tags that are empty by definition cannot match.
        if (none == tagInfo.getDisplay() || tagInfo.isEmptyTag()) {
            return false;
        }
        // Layout-critical elements only qualify when explicitly overridden.
        if (!override && unsafeBlockElements.contains(name)) {
            return false;
        }
        CharSequence text = tagNode.getText();
        if (!isEmptyString(text)) {
            return false;
        }
        // even though there may be no text need to make sure all children are empty or can be pruned
        if (tagNode.isEmpty()) {
            return true;
        }
        for (Object child : tagNode.getAllChildren()) {
            // TODO : similar check as in tagNode.isEmpty() argues for a visitor pattern
            // but allow empty td, ths to be pruned.
            boolean prunable;
            if (child instanceof TagNode) {
                prunable = satisfy((TagNode) child, true);
            } else if (child instanceof ContentNode) {
                prunable = ((ContentNode) child).isBlank();
            } else {
                prunable = false;
            }
            if (!prunable) {
                return false;
            }
        }
        return true;
    }

    /**
     * @return true if the node's "id" attribute holds a non-empty string.
     */
    private boolean hasIdAttributeSet(TagNode tagNode) {
        Map<String, String> attributes = tagNode.getAttributes();
        String id = attributes.get(ID_ATTRIBUTE_NAME);
        return !isEmptyString(id);
    }
}
@@ -0,0 +1,47 @@
package org.htmlcleaner.conditional;
import java.util.List;
import org.htmlcleaner.TagNode;
/**
 * Checks if node is an insignificant br tag -- one that is placed at the end or at the
 * start of a block.
 *
 * @author Konstantin Burov (aectann@gmail.com)
 */
public class TagNodeInsignificantBrCondition implements ITagNodeCondition {

    private static final String BR_TAG = "br";

    public TagNodeInsignificantBrCondition() {
    }

    /**
     * @param tagNode the node to inspect.
     * @return true if the node is a br and either everything before it, or everything from it to the
     *         end of its parent's children, consists solely of br nodes and pruned tag nodes.
     */
    public boolean satisfy(TagNode tagNode) {
        if (!isBrNode(tagNode)) {
            return false;
        }
        List<?> siblings = tagNode.getParent().getAllChildren();
        int position = siblings.indexOf(tagNode);
        // leading run: only br/pruned nodes in front of this br
        if (checkSublist(0, position, siblings)) {
            return true;
        }
        // trailing run: only br/pruned nodes from this br to the end
        return checkSublist(position, siblings.size(), siblings);
    }

    /**
     * @return true if the node is non-null and named "br".
     */
    private boolean isBrNode(TagNode tagNode) {
        return tagNode != null && BR_TAG.equals(tagNode.getName());
    }

    /**
     * @param start first index of the range (inclusive).
     * @param end   last index of the range (exclusive).
     * @param list  the children to examine.
     * @return true if every element in the range is a {@link TagNode} that is a br or is pruned.
     */
    private boolean checkSublist(int start, int end, List<?> list) {
        for (Object element : list.subList(start, end)) {
            if (!(element instanceof TagNode)) {
                return false;
            }
            TagNode candidate = (TagNode) element;
            if (isBrNode(candidate) || candidate.isPruned()) {
                continue;
            }
            return false;
        }
        return true;
    }
}
@@ -0,0 +1,18 @@
package org.htmlcleaner.conditional;
import org.htmlcleaner.TagNode;
/**
 * Condition that matches nodes by tag name, ignoring case.
 */
public class TagNodeNameCondition implements ITagNodeCondition {

    /** Tag name the node's name is compared with. */
    private String name;

    /**
     * @param name the tag name to match.
     */
    public TagNodeNameCondition(String name) {
        this.name = name;
    }

    /**
     * @param tagNode the node to inspect; may be null.
     * @return true if the node is non-null and its name equals the configured name ignoring case.
     */
    public boolean satisfy(TagNode tagNode) {
        if (tagNode == null) {
            return false;
        }
        return tagNode.getName().equalsIgnoreCase(this.name);
    }
}
@@ -0,0 +1,220 @@
/* Copyright (c) 2006-2013, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.jdom2.input.DOMBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.junit.Assert;
import org.junit.Before;
import org.w3c.dom.Document;
import static org.junit.Assert.assertEquals;
/**
 * Abstract test class with utility methods.
 * <p>
 * Before each test a fresh {@link HtmlCleaner} and {@link SimpleXmlSerializer} are created from one
 * shared {@link CleanerProperties} instance. The assertion helpers clean an input string, serialize
 * the result through one of the available serializers and compare it with the expected text.
 * </p>
 */
public abstract class AbstractHtmlCleanerTest {

    public static final String HEADER =
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
    // A DOCTYPE is currently not part of the header:
    //+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" "
    //+ "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n";
    private static final String HEADER_FULL = HEADER + "<html><head /><body>";
    private static final String FOOTER = "</body></html>";

    protected HtmlCleaner cleaner;
    protected Serializer serializer;

    /** Creates the cleaner and the default (simple XML) serializer used by the helpers. */
    @Before
    public void setup() {
        CleanerProperties props = new CleanerProperties();
        props.setOmitXmlDeclaration(true);
        props.setOmitDoctypeDeclaration(false);
        props.setAdvancedXmlEscape(true);
        props.setTranslateSpecialEntities(false);
        props.setOmitComments(false);
        props.setIgnoreQuestAndExclam(false);
        cleaner = new HtmlCleaner(props);
        serializer = new SimpleXmlSerializer(props);
    }

    /** Cleans {@code initial} and expects the default serializer to produce {@code expected}. */
    protected void assertCleaned(String initial, String expected) throws IOException {
        TagNode cleaned = cleaner.clean(initial);
        assertEquals(expected, render(serializer, cleaned));
    }

    /** Cleans {@code initial} and expects a {@link SimpleHtmlSerializer} to produce {@code expected}. */
    protected void assertCleanedHtml(String initial, String expected) throws IOException {
        TagNode cleaned = cleaner.clean(initial);
        Serializer htmlSerializer = new SimpleHtmlSerializer(cleaner.getProperties());
        assertEquals(expected, render(htmlSerializer, cleaned));
    }

    /**
     * Cleans {@code initial}, converts it to a W3C DOM, serializes that DOM with a default
     * {@link Transformer} and compares the trimmed body content, with every line trimmed, against
     * {@code expected}.
     */
    protected void assertCleanedDom(String initial, String expected) throws Exception {
        cleaner.getProperties().setOmitHtmlEnvelope(false);
        TagNode cleaned = cleaner.clean(initial);
        Document document = new DomSerializer(cleaner.getProperties()).createDOM(cleaned);

        StringWriter rawOutput = new StringWriter();
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.transform(new DOMSource(document), new StreamResult(rawOutput));

        // Normalize the transformer output: trim each line and terminate it with a single "\n".
        StringBuilder normalized = new StringBuilder();
        for (String line : rawOutput.getBuffer().toString().split("\n")) {
            normalized.append(line.trim());
            normalized.append("\n");
        }
        String all = normalized.toString();
        int bodyStart = all.indexOf("<body>\n") + 7;
        int bodyEnd = all.indexOf("</body>");
        assertEquals(expected, all.substring(bodyStart, bodyEnd).trim());

        // NOTE(review): unlike assertCleanedJDom, this leaves omitHtmlEnvelope set to true instead of
        // restoring the value it had on entry -- confirm this is intended.
        cleaner.getProperties().setOmitHtmlEnvelope(true);
    }

    /**
     * Cleans {@code initial}, converts it to a JDOM document and compares the text between
     * {@code <body>} and {@code </body>} against {@code expected}. The omitHtmlEnvelope property is
     * switched off for the duration of the check and restored afterwards.
     */
    protected void assertCleanedJDom(String initial, String expected) throws Exception {
        boolean omitEnvelopeBefore = cleaner.getProperties().isOmitHtmlEnvelope();
        cleaner.getProperties().setOmitHtmlEnvelope(false);

        TagNode cleaned = cleaner.clean(initial);
        org.jdom2.Document document = new JDomSerializer(cleaner.getProperties()).createJDom(cleaned);
        StringWriter output = new StringWriter();
        new XMLOutputter().output(document, output);

        String xml = output.getBuffer().toString();
        String body = xml.substring(xml.indexOf("<body>") + 6, xml.indexOf("</body>"));
        assertEquals(expected, body);

        cleaner.getProperties().setOmitHtmlEnvelope(omitEnvelopeBefore);
    }

    /** Reads the named file as UTF-8 text. */
    protected String readFile(String filename) throws IOException {
        CharSequence content = Utils.readUrl(new File(filename).toURI().toURL(), "UTF-8");
        return content.toString();
    }

    /** Expects the default serializer output to equal {@code expected} wrapped in header and envelope. */
    protected void assertHTML(String expected, String input) throws IOException {
        String actual = render(serializer, cleaner.clean(input));
        Assert.assertEquals(HEADER_FULL + expected + FOOTER, actual);
    }

    /** Expects the default serializer output to equal {@link #HEADER} followed by {@code expected}. */
    protected void assertHTMLWithHeader(String expected, String input) throws IOException {
        String actual = render(serializer, cleaner.clean(input));
        Assert.assertEquals(HEADER + expected, actual);
    }

    /** Same expectation as {@link #assertHTML}, but routed through {@link DomSerializer} and JDOM output. */
    protected void assertHTMLUsingDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
        Document dom = new DomSerializer(cleaner.getProperties()).createDOM(cleaner.clean(input));
        org.jdom2.Document jdom = new DOMBuilder().build(dom);
        Assert.assertEquals(HEADER_FULL + expected + FOOTER + "\n", rawXml(jdom));
    }

    /** Same expectation as {@link #assertHTML}, but routed through {@link JDomSerializer}. */
    protected void assertHTMLUsingJDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
        org.jdom2.Document jdom = new JDomSerializer(cleaner.getProperties()).createJDom(cleaner.clean(input));
        Assert.assertEquals(HEADER_FULL + expected + FOOTER + "\n", rawXml(jdom));
    }

    /** Like {@link #assertHTMLUsingJDomSerializer}, but {@code expected} supplies the html envelope itself. */
    protected void assertHTMLIncludingHeaderUsingJDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
        org.jdom2.Document jdom = new JDomSerializer(cleaner.getProperties()).createJDom(cleaner.clean(input));
        Assert.assertEquals(HEADER + expected + "\n", rawXml(jdom));
    }

    /**
     * Renders a W3C document as indented UTF-8 XML including the XML declaration.
     *
     * @return the serialized document, or an empty string if the transformation failed (the failure
     *         is reported on {@code System.err}).
     */
    protected String documentToString(final Document doc) {
        final TransformerFactory factory = TransformerFactory.newInstance();
        try {
            final Transformer transformer = factory.newTransformer();
            transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
            transformer.setOutputProperty(OutputKeys.METHOD, "xml");
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");

            final StringWriter output = new StringWriter();
            transformer.transform(new DOMSource(doc), new StreamResult(output));
            return output.getBuffer().toString();
        } catch (TransformerException e) {
            System.err.println("Failed to toString document " + e);
            return "";
        }
    }

    /** Writes {@code node} through {@code ser} using UTF-8 and returns the produced text. */
    private static String render(Serializer ser, TagNode node) throws IOException {
        StringWriter output = new StringWriter();
        ser.write(node, output, "UTF-8");
        return output.toString();
    }

    /** Outputs a JDOM document in raw format with UTF-8 encoding and "\n" line separators. */
    private static String rawXml(org.jdom2.Document document) {
        XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
        return outputter.outputString(document);
    }
}
@@ -0,0 +1,37 @@
package org.htmlcleaner;
import junit.framework.TestCase;
/**
 * Tests for the handling of badly terminated tags.
 *
 * @author patmoore
 */
public class BadTerminationTest extends TestCase {

    /**
     * An end tag that carries an attribute is expected to be serialized as a plain end tag.
     */
    public void testHandleGarbageInEndTag() throws Exception {
        CleanerProperties props = new CleanerProperties();
        props.setOmitHtmlEnvelope(true);
        props.setOmitXmlDeclaration(true);
        props.setUseEmptyElementTags(false);

        SimpleXmlSerializer xmlSerializer = new SimpleXmlSerializer(props);
        String output = xmlSerializer.getAsString("<div></div id=\"foo\">");

        assertEquals("<div></div>", output);
    }

    // Disabled test for white space inside tag names, kept for reference:
    // public void testWhiteSpaceInTag() throws Exception {
    // String s =
    // "<html><body><table width=\"838\" cellpadding=\"5\" cellspacing=\"0\">\n"
    // +
    // " <tbody>\n" +
    // " <td width=\"704\"> </td>\n" +
    // " </tr\n" +
    // " ></tbody>< /table></bo dy>";
    // CleanerProperties cleanerProperties = new CleanerProperties();
    // cleanerProperties.setOmitHtmlEnvelope(false);
    // cleanerProperties.setOmitXmlDeclaration(true);
    // cleanerProperties.setUseEmptyElementTags(false);
    // String output = new
    // SimpleXmlSerializer().getXmlAsString(cleanerProperties, s, "UTF-8");
    // assertEquals("<html><head></head><body><table width=\"838\" cellpadding=\"5\" cellspacing=\"0\"><tbody><tr><td width=\"704\"> </td></tr></tbody></table></body></html>",output);
    // }
}
@@ -0,0 +1,88 @@
package org.htmlcleaner;
import java.io.*;
import junit.framework.*;
/**
 * Test cases for {@link BrowserCompactXmlSerializer}.
 *
 * @author Konstantin Burov (aectann@gmail.com)
 *
 */
public class BrowserCompactXmlSerializerTest extends TestCase {

    private BrowserCompactXmlSerializer compactXmlSerializer;
    private CleanerProperties properties;

    @Override
    protected void setUp() throws Exception {
        properties = new CleanerProperties();
        properties.setOmitHtmlEnvelope(true);
        properties.setOmitXmlDeclaration(true);
        compactXmlSerializer = new BrowserCompactXmlSerializer(properties);
    }

    /**
     * White space between text and inline elements is kept as written.
     */
    public void testInlineWhitespaceHandling(){
        assertCompacted("<p>Test1 <a href=\"somelink\">Linktext</a> Test2</p>\n",
                "<p>Test1 <a href=\"somelink\">Linktext</a> Test2</p>");
        assertCompacted("<p>Test1<a href=\"somelink\">Linktext</a>Test2</p>\n",
                "<p>Test1<a href=\"somelink\">Linktext</a>Test2</p>");
        assertCompacted("one<br /><b>two</b>three<b>four</b>",
                "one<br><b>two</b></br>three<b>four</b>");
        assertCompacted("one<br /><b>two</b>three <b>four</b>",
                "one<br><b>two</b></br>three <b>four</b>");
    }

    /**
     * Tests that serializer removes white spaces properly.
     * @throws IOException
     */
    public void testRemoveInsignificantWhitespaces() throws IOException{
        assertCompacted("<u>text here, </u><b>some text</b>",
                " <u>text here, </u><b>some text</b> ");
        assertCompacted("<div class=\"foo\">2 roots &lt; here &gt;</div>\n",
                " <div class=\"foo\">2 roots < here > </div>");
        assertCompacted("<div class=\"foo\">2 roots &lt; here &gt;</div>\n",
                " <div class=\"foo\">2 roots \n < here > </div>");
        assertCompacted("<div class=\"foo\">2 roots <br />&lt; here &gt;</div>\n",
                " <div class=\"foo\">2 roots \n\n < here > </div>");
    }

    /**
     * Non-breakable spaces also must be removed from start and end.
     * @throws IOException
     */
    public void testRemoveLeadingAndEndingNbsp() throws IOException {
        assertCompacted("We have just released Jericho Road. Listen to Still Waters the lead-off track.",
                "&nbsp;&nbsp;We have just released Jericho Road. Listen to Still Waters the lead-off track.");
        assertCompacted("We have just released Jericho Road. Listen to Still Waters the lead-off track.",
                "&#160;We have just released Jericho Road. Listen to Still Waters the lead-off track.&#160;");
        assertCompacted("We have just released Jericho Road. Listen to Still Waters the lead-off track.",
                "&#xA0;We have just released Jericho Road. Listen to Still Waters the lead-off track.&#xA0;");
        assertCompacted("We have just released Jericho Road. Listen to Still Waters the lead-off track.",
                SpecialEntities.NON_BREAKABLE_SPACE
                + "We have just released Jericho Road. Listen to Still Waters the lead-off track.&#xA0;"
                + SpecialEntities.NON_BREAKABLE_SPACE);
    }

    /**
     * Tests that contents of 'pre' tag are untouched.
     * @throws IOException
     */
    public void testPreTagIsUntouched() throws IOException{
        assertCompacted("<pre>some text</pre>\n", " <pre>some text</pre>");
        assertCompacted("<pre> some text</pre>\n", "<pre> some text</pre>");
        assertCompacted("<pre>some /n/n text</pre>\n", "<pre>some /n/n text</pre>");
    }

    /**
     * Serializes {@code html} with the serializer under test and compares the result with
     * {@code expected}.
     */
    private void assertCompacted(String expected, String html) {
        String cleaned = compactXmlSerializer.getAsString(html);
        assertEquals(expected, cleaned);
    }
}
@@ -0,0 +1,604 @@
/* Copyright (c) 2006-2013, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import org.junit.Ignore;
import org.junit.Test;
public class CDATATest extends AbstractHtmlCleanerTest {
/**
 * Test for bug #189: a script whose CDATA section is never closed, followed by further markup.
 * The XML output is expected to wrap the script content in a commented ("safe") CDATA section and
 * to keep the paragraph that follows the script.
 * @throws Exception
 */
@Test
public void UnclosedCDATA() throws Exception{
    // Build the 2048 character payload with a StringBuilder rather than repeated String concatenation.
    StringBuilder payload = new StringBuilder(2048);
    for (int i = 0; i < 2048; i++) {
        payload.append('x');
    }
    String x = payload.toString();
    String html = "<script><![CDATA[" + x + "</script><p>Test</p>";
    String expected = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script><p>Test</p>";
    cleaner.getProperties().setOmitHtmlEnvelope(true);
    assertCleaned(html, expected);
    cleaner.getProperties().setOmitHtmlEnvelope(false);
}
/**
 * Test for bug #211: a script element containing an unclosed CDATA section with
 * a payload of 513 * 1024 characters, followed by a paragraph.
 * <p>
 * This passes, but is marked {@code @Ignore} because it takes a while to run.
 * Comment out the annotation and run this test before making any builds.
 * NOTE(review): the payload is now built in linear time; re-measure whether the
 * remaining runtime still justifies {@code @Ignore}.
 *
 * @throws Exception if cleaning or serialization fails
 */
@Ignore
@Test
public void UnclosedCDATA2() throws Exception{
    final int length = 513 * 1024;
    // Repeated String concatenation would copy the growing string on each of
    // the 525,312 iterations; a pre-sized StringBuilder avoids that.
    StringBuilder payload = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        payload.append('x');
    }
    String x = payload.toString();
    String html = "<script><![CDATA[" + x + "</script><p>Test</p>";
    String expected = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script><p>Test</p>";
    cleaner.getProperties().setOmitHtmlEnvelope(true);
    try {
        assertCleaned(html, expected);
    } finally {
        // Undo the property change even when the assertion fails.
        cleaner.getProperties().setOmitHtmlEnvelope(false);
    }
}
/**
 * Test for bug #185: input that ends inside an unclosed CDATA section in a
 * script element, with a 1024-character payload. The HTML serialization is
 * expected to contain the bare payload, and the XML and JDom serializations the
 * payload wrapped in a commented CDATA section.
 *
 * @throws Exception if cleaning or serialization fails
 */
@Test
public void noEndTokenLong() throws Exception{
    final int length = 1024;
    // StringBuilder instead of repeated String concatenation (quadratic copying).
    StringBuilder payload = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        payload.append('x');
    }
    String x = payload.toString();
    String input = "<script><![CDATA[" + x;
    String expected = "<script>"+x+"</script>";
    String expectedXml = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script>";
    cleaner.getProperties().setOmitHtmlEnvelope(true);
    try {
        assertCleanedHtml(input, expected);
        assertCleaned(input, expectedXml);
        assertCleanedJDom(input, expectedXml);
    } finally {
        // Undo the property change even when an assertion fails.
        cleaner.getProperties().setOmitHtmlEnvelope(false);
    }
}
/**
 * Test for bug #189: input that ends inside an unclosed CDATA section in a
 * script element, with a 4096-character payload. The HTML serialization is
 * expected to contain the bare payload, and the XML and JDom serializations the
 * payload wrapped in a commented CDATA section.
 *
 * @throws Exception if cleaning or serialization fails
 */
@Test
public void noEndTokenReallyLong() throws Exception{
    final int length = 4096;
    // StringBuilder instead of repeated String concatenation (quadratic copying).
    StringBuilder payload = new StringBuilder(length);
    for (int i = 0; i < length; i++) {
        payload.append('x');
    }
    String x = payload.toString();
    String input = "<script><![CDATA[" + x;
    String expected = "<script>"+x+"</script>";
    String expectedXml = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script>";
    cleaner.getProperties().setOmitHtmlEnvelope(true);
    try {
        assertCleanedHtml(input, expected);
        assertCleaned(input, expectedXml);
        assertCleanedJDom(input, expectedXml);
    } finally {
        // Undo the property change even when an assertion fails.
        cleaner.getProperties().setOmitHtmlEnvelope(false);
    }
}
/**
 * Regression test for issue #134: cleaning {@code test31.html} has to finish
 * without throwing. The cleaned tree itself is not inspected.
 *
 * @throws IOException declared for reading the test resource
 */
@Test
public void strayEndTagInCDATA() throws IOException{
    cleaner.clean(readFile("src/test/resources/test31.html"));
}
/**
 * A malformed CDATA marker in ordinary paragraph content is expected to be
 * escaped as text rather than treated as a CDATA section.
 *
 * @throws IOException declared by the assertion helper
 */
@Test
public void NotReallyCData() throws IOException{
    assertCleaned(
            "<p><![CDATA ]]> is sometimes used in XML",
            "<html>\n<head />\n<body><p>&lt;![CDATA ]]&gt; is sometimes used in XML</p></body></html>");
}
/**
 * No-op round trip: with an HTML serializer, script content is not wrapped in
 * a CDATA section the way the XML serializers do it, so the document is
 * expected to come back unchanged.
 *
 * @throws IOException declared by the assertion helper
 */
@Test
public void NoCData() throws IOException{
    CleanerProperties props = new CleanerProperties();
    props.setOmitXmlDeclaration(true);
    props.setOmitDoctypeDeclaration(true);
    props.setIgnoreQuestAndExclam(false);
    props.setUseCdataForScriptAndStyle(true);
    this.cleaner = new HtmlCleaner(props);
    this.serializer = new SimpleHtmlSerializer(cleaner.getProperties());

    // Input and expected output are the same string.
    String document = "<html><head><script>function testNoOp(){<>}</script></head><body></body></html>";
    assertCleaned(document, document);
}
/**
 * Mixed case taken from {@code test11.html}: per the original description the
 * file combines a script without CDATA, an unescaped CDATA section in a script
 * tag, and an incorrect CDATA declaration in a paragraph tag. The cleaned
 * result is compared with {@code test11_expected.html}.
 *
 * @throws IOException declared for reading the test resources
 */
@Test
public void CDATAmixed() throws IOException{
    assertCleaned(
            readFile("src/test/resources/test11.html"),
            readFile("src/test/resources/test11_expected.html"));
}
/**
 * Cleans {@code test12.html} with a cleaner configured to keep the XML
 * declaration and the doctype, using a {@code SimpleXmlSerializer}, and
 * compares the result with {@code test12_expected.html}.
 *
 * @throws IOException declared for reading the test resources
 */
@Test
public void CDATAandDocType() throws IOException{
    CleanerProperties props = new CleanerProperties();
    props.setOmitXmlDeclaration(false);
    props.setOmitDoctypeDeclaration(false);
    props.setIgnoreQuestAndExclam(false);
    this.cleaner = new HtmlCleaner(props);
    this.serializer = new SimpleXmlSerializer(cleaner.getProperties());

    assertCleaned(
            readFile("src/test/resources/test12.html"),
            readFile("src/test/resources/test12_expected.html"));
}
/**
 * Exercises CDATA handling for script, style and altscript elements with a
 * {@code SimpleXmlSerializer}. In every case below the first argument carries
 * the commented form {@code /*<![CDATA[*}{@code /} ... {@code /*]]>*}{@code /}
 * and the second argument a raw variant; the helpers appear to take
 * (expected, input) - confirm against AbstractHtmlCleanerTest.
 *
 * @throws IOException declared by the assertion helpers
 */
@Test
public void scriptAndCData() throws IOException
{
CleanerProperties cleanerProperties = new CleanerProperties();
cleanerProperties.setOmitXmlDeclaration(false);
cleanerProperties.setOmitDoctypeDeclaration(false);
cleanerProperties.setIgnoreQuestAndExclam(false);
cleanerProperties.setAddNewlineToHeadAndBody(false);
// Request CDATA handling for script, style and altscript elements.
cleanerProperties.setUseCdataFor("script,style,altscript");
this.cleaner = new HtmlCleaner(cleanerProperties);
this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
// CDATA markers guarded by line comments (//<![CDATA[ ... //]]>).
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
// Bare CDATA section without comment guards.
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
// Style element; this helper's first argument is a complete html/head/body document.
assertHTMLWithHeader(
"<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
"<style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style>");
// No CDATA in the raw variant; the script's own line comments appear in both arguments.
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
"<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
// CDATA sections whose content itself contains line comments.
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
// Multi-line script with "// <![CDATA[" guards and regex literals containing '<' and '>'.
assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n"
+ "// \n"
+ "function escapeForXML(origtext) {\n"
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+ "}\n"
+ "// \n/*]]>*/"
+ "</script>", "<script type=\"text/javascript\">\n"
+ "// <![CDATA[\n"
+ "function escapeForXML(origtext) {\n"
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+ "}\n"
+ "// ]]>\n"
+ "</script>");
// Raw "<>" content with no CDATA markers in the second argument.
assertHTML("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
assertHTML("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
// First script opens //<![CDATA[ without closing it and is followed by a second script.
assertHTML(
"<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
"<script>//<![CDATA[\nbanana(); //--></script><script>two</script>"
);
}
/**
 * Runs the script/style/altscript CDATA cases through the
 * {@code assertHTMLUsingDomSerializer} helper. No serializer field is assigned
 * in this method; the helper presumably builds its own - confirm against
 * AbstractHtmlCleanerTest.
 *
 * @throws IOException declared by the assertion helpers
 * @throws ParserConfigurationException declared by the assertion helpers
 * @throws Exception declared by the assertion helpers (already covers the two above)
 */
@Test
public void scriptAndCDataDom() throws IOException, ParserConfigurationException, Exception
{
CleanerProperties cleanerProperties = new CleanerProperties();
cleanerProperties.setOmitXmlDeclaration(false);
cleanerProperties.setOmitDoctypeDeclaration(false);
cleanerProperties.setIgnoreQuestAndExclam(false);
cleanerProperties.setAddNewlineToHeadAndBody(false);
// Request CDATA handling for script, style and altscript elements.
cleanerProperties.setUseCdataFor("script,style,altscript");
this.cleaner = new HtmlCleaner(cleanerProperties);
// CDATA markers guarded by line comments (//<![CDATA[ ... //]]>).
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
// Bare CDATA section without comment guards.
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
// NOTE(review): this Dom test calls the JDom header helper for the style case - looks
// like a copy/paste from scriptAndCDataJDom; confirm whether a Dom variant was intended.
assertHTMLIncludingHeaderUsingJDomSerializer(
"<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
"<html><head><style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style></head></html>"
);
// No CDATA in the raw variant; the script's own line comments appear in both arguments.
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
"<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
// CDATA sections whose content itself contains line comments.
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
// Multi-line script with "// <![CDATA[" guards and regex literals containing '<' and '>'.
assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n"
+ "// \n"
+ "function escapeForXML(origtext) {\n"
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+ "}\n"
+ "// \n/*]]>*/"
+ "</script>", "<script type=\"text/javascript\">\n"
+ "// <![CDATA[\n"
+ "function escapeForXML(origtext) {\n"
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+ "}\n"
+ "// ]]>\n"
+ "</script>");
// Raw "<>" content with no CDATA markers in the second argument.
assertHTMLUsingDomSerializer("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
assertHTMLUsingDomSerializer("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
// First script opens //<![CDATA[ without closing it and is followed by a second script.
assertHTMLUsingDomSerializer(
"<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
"<script>//<![CDATA[\nbanana(); //--></script><script>two</script>"
);
}
/**
 * Runs the script/style/altscript CDATA cases through the
 * {@code assertHTMLUsingJDomSerializer} helpers. No serializer field is
 * assigned in this method; the helpers presumably build their own - confirm
 * against AbstractHtmlCleanerTest.
 *
 * @throws IOException declared by the assertion helpers
 * @throws ParserConfigurationException declared by the assertion helpers
 */
@Test
public void scriptAndCDataJDom() throws IOException, ParserConfigurationException
{
CleanerProperties cleanerProperties = new CleanerProperties();
cleanerProperties.setOmitXmlDeclaration(false);
cleanerProperties.setOmitDoctypeDeclaration(false);
cleanerProperties.setIgnoreQuestAndExclam(false);
cleanerProperties.setAddNewlineToHeadAndBody(false);
// Request CDATA handling for script, style and altscript elements.
cleanerProperties.setUseCdataFor("script,style,altscript");
this.cleaner = new HtmlCleaner(cleanerProperties);
// CDATA markers guarded by line comments (//<![CDATA[ ... //]]>).
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
// Bare CDATA section without comment guards.
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
// Style element; this helper's first argument is a complete html/head/body document.
assertHTMLIncludingHeaderUsingJDomSerializer("<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
"<style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style>");
// No CDATA in the raw variant; the script's own line comments appear in both arguments.
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
"<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
// CDATA sections whose content itself contains line comments.
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
"<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
"<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
// Multi-line script with "// <![CDATA[" guards and regex literals containing '<' and '>'.
assertHTMLUsingJDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n"
+ "// \n"
+ "function escapeForXML(origtext) {\n"
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+ "}\n"
+ "// \n/*]]>*/"
+ "</script>", "<script type=\"text/javascript\">\n"
+ "// <![CDATA[\n"
+ "function escapeForXML(origtext) {\n"
+ " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+ " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+ "}\n"
+ "// ]]>\n"
+ "</script>");
// Raw "<>" content with no CDATA markers in the second argument.
assertHTMLUsingJDomSerializer("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
assertHTMLUsingJDomSerializer("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
// First script opens //<![CDATA[ without closing it and is followed by a second script.
assertHTMLUsingJDomSerializer(
"<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
"<script>//<![CDATA[\nbanana(); //--></script><script>two</script>"
);
}
/**
 * With advanced XML escaping and entity deserialization switched on, the
 * entity-escaped {@code &lt;&gt;} content of script and altscript elements is
 * expected to be emitted as a literal {@code <>} inside a commented CDATA section.
 *
 * @throws IOException declared by the assertion helper
 */
@Test
public void escapingCDATA() throws IOException{
    CleanerProperties props = new CleanerProperties();
    props.setOmitXmlDeclaration(false);
    props.setOmitDoctypeDeclaration(false);
    props.setIgnoreQuestAndExclam(false);
    props.setAdvancedXmlEscape(true);
    props.setAddNewlineToHeadAndBody(false);
    props.setDeserializeEntities(true);
    props.setUseCdataFor("script,style,altscript");
    this.cleaner = new HtmlCleaner(props);
    this.serializer = new SimpleXmlSerializer(cleaner.getProperties());

    // Same check for both element names, script first.
    for (String tag : new String[] {"script", "altscript"}) {
        String open = "<" + tag + ">";
        String close = "</" + tag + ">";
        assertHTML(open + "/*<![CDATA[*/\n<>\n/*]]>*/" + close, open + "&lt;&gt;" + close);
    }
}
/**
 * With {@code omitCdataOutsideScriptAndStyle} enabled, CDATA sections that are
 * not inside one of the configured CDATA elements are treated like HTML
 * comments and stripped during cleaning.
 *
 * @throws IOException declared by the assertion helper
 */
@Test
public void removeCDATA() throws IOException{
    CleanerProperties props = new CleanerProperties();
    props.setOmitCdataOutsideScriptAndStyle(true);
    props.setAddNewlineToHeadAndBody(false);
    props.setUseCdataFor("script,style,altscript");
    this.cleaner = new HtmlCleaner(props);
    this.serializer = new SimpleXmlSerializer(this.cleaner.getProperties());

    // CDATA alone in a paragraph disappears entirely.
    assertHTML("<p></p>", "<p><![CDATA[&]]></p>");
    // Surrounding text survives (escaped); only the CDATA section is dropped.
    assertHTML("<p>&amp;&amp;</p>", "<p>&<![CDATA[&]]>&</p>");
    // noaltscript is not in the configured list, so its CDATA is dropped too.
    assertHTML("<noaltscript />", "<noaltscript><![CDATA[&]]></noaltscript>");
}
/**
 * A CDATA section placed in a paragraph, outside of script and style tags,
 * must not produce a CData node: the first child of the paragraph is expected
 * to be an ordinary {@code ContentNode}.
 */
@Test
public void CDATAinthewrongplace(){
    CleanerProperties props = new CleanerProperties();
    props.setIgnoreQuestAndExclam(true);
    cleaner = new HtmlCleaner(props);

    String markup = "<p>"
            + "<![CDATA[\n"
            + "function helloWorld() {\n"
            + "};\n"
            + "]]>\n"
            + "</p>";

    TagNode paragraph = cleaner.clean(markup).findElementByName("p", true);

    // The section has to be handled as regular content, not as CDATA.
    Object firstChild = paragraph.getAllChildren().get(0);
    assertTrue(firstChild instanceof ContentNode);
}
/**
 * A bare (uncommented) CDATA section inside a script element is expected to
 * become a {@code CData} node whose content excludes the start and end tokens.
 */
@Test
public void nonSafeCDATA(){
    String markup = "<script type=\"text/javascript\">"
            + "<![CDATA[\n"
            + "function helloWorld() {\n"
            + "};\n"
            + "]]>\n"
            + "</script>";

    TagNode scriptNode = cleaner.clean(markup).findElementByName("script", true);

    // The first child has to be the CData node for the CDATA section.
    Object firstChild = scriptNode.getAllChildren().get(0);
    assertTrue(firstChild instanceof CData);

    String body = ((CData) firstChild).getContentWithoutStartAndEndTokens();
    assertEquals("\nfunction helloWorld() {\n};\n", body);
}
/**
 * For a bare CDATA section inside a script element, checks both accessors of
 * the resulting {@code CData} node: the content without tokens, and the content
 * with the commented start and end tokens.
 */
@Test
public void safeOutput(){
    String markup = "<script type=\"text/javascript\">"
            + "<![CDATA[\n"
            + "function helloWorld() {\n"
            + "};\n"
            + "]]>\n"
            + "</script>";

    TagNode scriptNode = cleaner.clean(markup).findElementByName("script", true);

    // The first child has to be the CData node for the CDATA section.
    Object firstChild = scriptNode.getAllChildren().get(0);
    assertTrue(firstChild instanceof CData);
    CData section = (CData) firstChild;

    assertEquals("\nfunction helloWorld() {\n};\n", section.getContentWithoutStartAndEndTokens());
    assertEquals("/*<![CDATA[*/\nfunction helloWorld() {\n};\n/*]]>*/", section.getContentWithStartAndEndTokens());
}
/**
 * Inside a CDATA section, '<' and '>' must not end the token: the whole
 * section, here guarded by {@code //} line comments, is expected to end up in
 * a single {@code CData} node.
 */
@Test
public void safeCDATAAlternate(){
    String markup = "<script type=\"text/javascript\">\n"
            + "//<![CDATA[\n"
            + "function escapeForXML(origtext) {\n"
            + " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
            + " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
            + "}\n"
            + "//]]>\n"
            + "</script>";

    TagNode scriptNode = cleaner.clean(markup).findElementByName("script", true);

    // The CData node is the second child (index 1) of the script element.
    Object secondChild = scriptNode.getAllChildren().get(1);
    assertTrue(secondChild instanceof CData);

    String body = ((CData) secondChild).getContentWithoutStartAndEndTokens();
    assertEquals("\nfunction escapeForXML(origtext) {\n return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n .replace(/>/g,'&'+'gt;').replace(/'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');}\n", body);
}
/**
 * Inside a CDATA section, '<' and '>' must not end the token: the whole
 * section, here guarded by block comments, is expected to end up in a single
 * {@code CData} node.
 */
@Test
public void safeCDATA(){
    String markup = "<script type=\"text/javascript\">\n"
            + "/*<![CDATA[*/\n"
            + "function escapeForXML(origtext) {\n"
            + " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
            + " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
            + "}\n"
            + "/*]]>*/>\n"
            + "</script>";

    TagNode scriptNode = cleaner.clean(markup).findElementByName("script", true);

    // The CData node is the second child (index 1) of the script element.
    Object secondChild = scriptNode.getAllChildren().get(1);
    assertTrue(secondChild instanceof CData);

    String body = ((CData) secondChild).getContentWithoutStartAndEndTokens();
    assertEquals("\nfunction escapeForXML(origtext) {\n return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n .replace(/>/g,'&'+'gt;').replace(/'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');}\n", body);
}
/**
 * A commented CDATA section inside a style element is expected to become a
 * {@code CData} node holding the CSS without the start and end tokens.
 */
@Test
public void style(){
    String markup = "<style type=\"text/css\">/*<![CDATA[*/\n#ampmep_188 { }\n/*]]>*/</style>";
    TagNode styleNode = cleaner.clean(markup).findElementByName("style", true);

    Object firstChild = styleNode.getAllChildren().get(0);
    assertTrue(firstChild instanceof CData);
    assertEquals("\n#ampmep_188 { }\n", ((CData) firstChild).getContentWithoutStartAndEndTokens());
}
/**
 * Cleans {@code test17.html} with the XML declaration retained and compares
 * the result with {@code test17_expected.html}.
 *
 * @throws IOException declared for reading the test resources
 */
@Test
public void preserveComments() throws IOException{
    cleaner.getProperties().setOmitXmlDeclaration(false);
    assertCleaned(
            readFile("src/test/resources/test17.html"),
            readFile("src/test/resources/test17_expected.html"));
}
/**
 * A script whose CDATA section is opened with {@code //<![CDATA[} and contains
 * line comments of its own: the comments are expected to survive inside the
 * commented CDATA section.
 *
 * @throws IOException declared by the assertion helper
 */
@Test
public void preserveCommentsXwiki() throws IOException{
    CleanerProperties props = cleaner.getProperties();
    props.setOmitXmlDeclaration(false);
    props.setAddNewlineToHeadAndBody(false);

    String expected = "<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>";
    String input = "<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>";
    assertHTML(expected, input);
}
/**
 * A bare CDATA section whose only content is a line comment: the comment is
 * expected to survive inside the commented CDATA section.
 *
 * @throws IOException declared by the assertion helper
 */
@Test
public void preserveComments2() throws IOException{
    CleanerProperties props = cleaner.getProperties();
    props.setOmitXmlDeclaration(false);
    props.setAddNewlineToHeadAndBody(false);

    String expected = "<script type=\"text/javascript\">/*<![CDATA[*/\n//alert(\"Hello World\")\n/*]]>*/</script>";
    String input = "<script type=\"text/javascript\"><![CDATA[//alert(\"Hello World\")]]></script>";
    assertHTML(expected, input);
}
}
@@ -0,0 +1,125 @@
package org.htmlcleaner;
import java.io.IOException;
import org.junit.Test;
import junit.framework.TestCase;
/**
 * Tests that a tag closed because of one of its children (when the child tag is not allowed inside the parent) is
 * reopened afterwards.
 * Example:
 * <pre>{@code
 * <div><p>text1<table><tr><td>text2</td></tr></table>text3</p></div>
 * }</pre>
 * A table is not allowed inside a {@code <p>}. Most browsers handle this by placing the table between the line
 * before it and the line after it, and in general allow it.
 *
 * Cleaning here would normally result in:
 * <pre>{@code
 * <div><p>text1<table><tr><td>text2</td></tr></table>text3</div>
 * }</pre>
 * 'text3' is no longer inside the original element type ('p'). Instead, 'text3' is now within a 'div', so it
 * would no longer be styled correctly.
 *
 * A more correct result is:
 * <pre>{@code
 * <div><p>text1<table><tr><td>text2</td></tr></table><p>text3</p></div>
 * }</pre>
 */
public class ClosedTagReopenTest extends TestCase {
/**
 * Table-driven check with the cleaner set to HTML 4: each entry pairs an input
 * fragment (index 0) with the expected serialization (index 1), produced by a
 * {@code SimpleXmlSerializer} with the XML declaration and HTML envelope omitted.
 *
 * @throws IOException declared by {@code getAsString}
 */
public void testSimpleHTML4() throws IOException {
CleanerProperties properties = new CleanerProperties();
properties.setHtmlVersion(HtmlCleaner.HTML_4);
properties.setOmitXmlDeclaration(true);
properties.setOmitHtmlEnvelope(true);
SimpleXmlSerializer serializer = new SimpleXmlSerializer(properties);
String[][] tests= {
new String[] { "<p>text1<table><tr><td>text2</td></tr></table>text3</p>", "<p>text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p>text3</p>" },
new String[] {"</p>text1","text1"},
new String[] {"<p>text1<div>text2</div>text3</p>", "<p>text1</p><div>text2</div><p>text3</p>"},
new String[] { "<div>text1<p>text2</p>text3</div>", "<div>text1<p>text2</p>text3</div>"},
new String[] {"<font>text1<p>text2</p>text3</font>", "<font>text1</font><p><font>text2</font></p><font>text3</font>"},
new String[] {"<p>text1<div>text2</div>text3<div>text4</div></p>", "<p>text1</p><div>text2</div><p>text3</p><div>text4</div>"},
new String[] {"<p>text1<div>text2</div></p>", "<p>text1</p><div>text2</div>"},
new String[] {"<p>text1<p>text2</p></p>", "<p>text1</p><p>text2</p>"},
//test multiple internal breaks
new String[] {"<p><div>text1<p>text2<div>text3<p>text4<div>text5</div></p></div></p></div>","<p></p><div>text1<p>text2</p><div>text3<p>text4</p><div>text5</div></div></div>"},
// test attribute preservation
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2</td></tr></table>text3</p>",
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
// but not all attributes ( id attribute must be unique )
// TODO: maybe a generated id so that correlation can be found?
new String[] { "<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1<table><tr><td>text2</td></tr></table>text3</p>",
"<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
// test multiple replacements
// test to see if nested good <p> can be handled.
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p>text2a</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p>text2a</p></td></tr></tbody></table>" +
"<p class=\"p_class\" random=\"attribute\">text3</p>" +
"<ul><li>text4</li></ul>" +
"<p class=\"p_class\" random=\"attribute\">text5</p>" +
"<ul><li>text6</li></ul>" },
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p class=\"another_p_element\">text2a<div>test2b</div>test2c</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p class=\"another_p_element\">text2a</p><div>test2b</div><p class=\"another_p_element\">test2c</p></td></tr></tbody></table>" +
"<p class=\"p_class\" random=\"attribute\">text3</p>" +
"<ul><li>text4</li></ul>" +
"<p class=\"p_class\" random=\"attribute\">text5</p>" +
"<ul><li>text6</li></ul>" },
new String[]{"<p>text1<table><tr><td>text2<tr><td>text3</table>text4</p>","<p>text1</p><table><tbody><tr><td>text2</td></tr><tr><td>text3</td></tr></tbody></table><p>text4</p>"}
};
// Clean every input and compare; the failure message names the offending input.
for(String[] test: tests) {
String cleaned = serializer.getAsString(test[0]);
assertEquals("started with="+test[0], test[1], cleaned);
}
}
/**
 * Table-driven check with the cleaner set to HTML 5: each entry pairs an input
 * fragment (index 0) with the expected serialization (index 1), produced by a
 * {@code SimpleXmlSerializer} with the XML declaration and HTML envelope omitted.
 * <p>
 * NOTE(review): this class extends {@code junit.framework.TestCase}, so the
 * method is presumably discovered through its {@code test} name prefix and the
 * JUnit 4 {@code @Test} annotation looks redundant - confirm against the runner
 * in use.
 *
 * @throws IOException declared by {@code getAsString}
 */
@Test
public void testSimpleHTML5() throws IOException {
CleanerProperties properties = new CleanerProperties();
properties.setHtmlVersion(HtmlCleaner.HTML_5);
properties.setOmitXmlDeclaration(true);
properties.setOmitHtmlEnvelope(true);
SimpleXmlSerializer serializer = new SimpleXmlSerializer(properties);
String[][] tests= {
new String[] { "<p>text1<table><tr><td>text2</td></tr></table>text3</p>", "<p>text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p>text3</p>" },
new String[] {"</p>text1","text1"},
new String[] {"<p>text1<div>text2</div>text3</p>", "<p>text1</p><div>text2</div><p>text3</p>"},
new String[] { "<div>text1<p>text2</p>text3</div>", "<div>text1<p>text2</p>text3</div>"},
new String[] {"text1<p>text2</p>text3", "text1<p>text2</p>text3"},
new String[] {"<p>text1<div>text2</div>text3<div>text4</div></p>", "<p>text1</p><div>text2</div><p>text3</p><div>text4</div>"},
new String[] {"<p>text1<div>text2</div></p>", "<p>text1</p><div>text2</div>"},
new String[] {"<p>text1<p>text2</p></p>", "<p>text1</p><p>text2</p>"},
//test multiple internal breaks
new String[] {"<p><div>text1<p>text2<div>text3<p>text4<div>text5</div></p></div></p></div>","<p></p><div>text1<p>text2</p><div>text3<p>text4</p><div>text5</div></div></div>"},
// test attribute preservation
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2</td></tr></table>text3</p>",
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
// but not all attributes ( id attribute must be unique )
// TODO: maybe a generated id so that correlation can be found?
new String[] { "<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1<table><tr><td>text2</td></tr></table>text3</p>",
"<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
// test multiple replacements
// test to see if nested good <p> can be handled.
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p>text2a</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p>text2a</p></td></tr></tbody></table>" +
"<p class=\"p_class\" random=\"attribute\">text3</p>" +
"<ul><li>text4</li></ul>" +
"<p class=\"p_class\" random=\"attribute\">text5</p>" +
"<ul><li>text6</li></ul>" },
new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p class=\"another_p_element\">text2a<div>test2b</div>test2c</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
"<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p class=\"another_p_element\">text2a</p><div>test2b</div><p class=\"another_p_element\">test2c</p></td></tr></tbody></table>" +
"<p class=\"p_class\" random=\"attribute\">text3</p>" +
"<ul><li>text4</li></ul>" +
"<p class=\"p_class\" random=\"attribute\">text5</p>" +
"<ul><li>text6</li></ul>" },
new String[]{"<p>text1<table><tr><td>text2<tr><td>text3</table>text4</p>","<p>text1</p><table><tbody><tr><td>text2</td></tr><tr><td>text3</td></tr></tbody></table><p>text4</p>"}
};
// Clean every input and compare; the failure message names the offending input.
for(String[] test: tests) {
String cleaned = serializer.getAsString(test[0]);
assertEquals("started with="+test[0], test[1], cleaned);
}
}
}
@@ -0,0 +1,215 @@
package org.htmlcleaner;
import java.io.IOException;
import org.htmlcleaner.conditional.TagNodeEmptyContentCondition;
import org.htmlcleaner.conditional.TagNodeInsignificantBrCondition;
import junit.framework.TestCase;
/**
* Various tests for collapseNullHtml mode.
*/
public class CollapseHtmlTest extends TestCase {
/**
 * Table row made of an empty cell followed by a cell with text.
 * NOTE(review): presumably a row the prune conditions must leave intact - its
 * usage is outside the visible part of this file; confirm.
 */
private static final String CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR = "<tr><td></td><td>Cannot eliminate anything in this row</td></tr>";
/**
 * Self-closed image element used as content inside the fixtures below.
 */
private static final String IMAGE = "<img src=\"http://localhost:8080/img/foo.jpg\" />";
/**
 * Markup built around {@link #IMAGE}: a span, a paragraph, and a paragraph
 * wrapping a table with an empty cell, an image cell and a blank cell.
 * NOTE(review): presumably the input for a "nothing may be collapsed" test - confirm.
 */
private static final String DONT_COLLAPSE = "<span>" + IMAGE + "</span>" + "<p>" + IMAGE + "</p>"
+ "<p>bar<table><tr><td></td><td>" + IMAGE + "</td><td> </td></tr></table>foo</p>";
// Same content as DONT_COLLAPSE with the paragraph split around the table and a tbody added;
// presumably the expected cleaned form of DONT_COLLAPSE - confirm.
private static final String DONT_COLLAPSE_OUTPUT = "<span>" + IMAGE + "</span>" + "<p>" + IMAGE + "</p>"
+ "<p>bar</p><table><tbody><tr><td></td><td>" + IMAGE + "</td><td> </td></tr></tbody></table><p>foo</p>";
// Fixtures assigned in setUp().
private HtmlCleaner cleaner;
private CleanerProperties properties;
private SimpleXmlSerializer serializer;
/**
 * Creates a fresh cleaner whose properties omit the HTML envelope and the XML
 * declaration, a serializer sharing those properties, and registers the two
 * prune conditions (empty content, insignificant br) under test.
 */
@Override
protected void setUp() throws Exception {
    this.cleaner = new HtmlCleaner();
    this.properties = this.cleaner.getProperties();
    this.properties.setOmitHtmlEnvelope(true);
    this.properties.setOmitXmlDeclaration(true);
    this.serializer = new SimpleXmlSerializer(this.properties);
    this.properties.addPruneTagNodeCondition(
            new TagNodeEmptyContentCondition(this.properties.getTagInfoProvider()));
    this.properties.addPruneTagNodeCondition(new TagNodeInsignificantBrCondition());
}
/**
 * A single empty tag is expected to be dropped completely.
 *
 * @throws IOException declared by the serializer
 */
public void testCollapseSingleEmptyTag() throws IOException {
    assertEquals("", serializer.getAsString(cleaner.clean("<u></u>")));
}
/**
 * Tags containing only blanks, written literally or as the {@code &#x20;}
 * character reference, are expected to be collapsed.
 * <p>
 * The MS Word spacer insert ({@code <span style='mso-spacerun:yes'>} holding
 * only spaces) is not checked; its assertion was disabled in the original test.
 */
public void testCollapseSingleTagWithBlanks() throws IOException {
    String[] inputs = {"<u> </u>", "<u> &#x20; </u>"};
    for (String input : inputs) {
        assertEquals("", serializer.getAsString(cleaner.clean(input)));
    }
}
/**
 * Non-breaking spaces, given as a named entity, as decimal and hexadecimal
 * character references, and through {@code SpecialEntities.NON_BREAKABLE_SPACE},
 * are expected to be collapsed away as well.
 */
public void testCollapseSingleTagWithNbsp() throws IOException {
    String[] inputs = {
        "<u> &nbsp; </u>",
        "<u> &#160; </u>",
        "<u> &#xA0; </u>",
        "<u> " + SpecialEntities.NON_BREAKABLE_SPACE + " </u>"
    };
    for (String input : inputs) {
        assertEquals("", serializer.getAsString(cleaner.clean(input)));
    }
}
/**
 * Nested empty tags are expected to collapse together, including when the
 * closing tags are slightly out of order; a parent with real text is kept.
 */
public void testCollapseMultipleEmptyTags() throws IOException {
    // Well-formed nesting.
    assertEquals("", serializer.getAsString(cleaner.clean("<b><i><u></u></i></b>")));
    // Slightly bad html: closing tags in the wrong order.
    assertEquals("", serializer.getAsString(cleaner.clean("<b><i><u></i></u></b>")));
    // Slightly bad html again, but the outer tag holds text and must remain.
    assertEquals("<b>notme</b>", serializer.getAsString(cleaner.clean("<b><i><u></i></u>notme</b>")));
}
/**
 * Checks that {@code <br>} elements at the very start or end of their
 * container are dropped while a {@code <br>} between pieces of text is kept.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testCollapseInsignificantBr() throws IOException {
    // Each row is { input markup, expected serialization }.
    final String[][] cases = {
        {"<p><br/>Some text</p>", "<p>Some text</p>"},
        {"<p>Some text<BR/></p>", "<p>Some text</p>"},
        {"<p><br/>Some<br/> text<br/></p>", "<p>Some<br /> text</p>"},
        {"<p><br/><br/>Some text <i>look here</i></p>", "<p>Some text <i>look here</i></p>"},
        {"Some text<BR/>", "Some text"}
    };
    for (String[] inputAndExpected : cases) {
        assertEquals(inputAndExpected[1], serializer.getAsString(cleaner.clean(inputAndExpected[0])));
    }
}
/**
 * Checks that a registered tag transformation does not get in the way of
 * collapsing: with a {@code "font"}/{@code "span"} transformation installed,
 * a nest of empty elements containing a {@code <font>} still serializes to
 * an empty string.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testCollapseEmptyWithTagTransformations() throws IOException {
    TagTransformation fontRule = new TagTransformation("font", "span", true);
    fontRule.addAttributeTransformation("style", "${style};font-family:${face};font-size:${size};color:${color};");
    fontRule.addAttributeTransformation("face");
    fontRule.addAttributeTransformation("size");
    fontRule.addAttributeTransformation("color");
    fontRule.addAttributeTransformation("name", "${face}_1");
    properties.getCleanerTransformations().addTransformation(fontRule);

    assertEquals("", serializer.getAsString(cleaner.clean("<b><font face=\"Ariel\"><u></u></font></b>")));
}
/**
 * Checks that runs of consecutive {@code <br>} elements at the start and end
 * of a paragraph are all eliminated, leaving only the {@code <br>} that sits
 * between the two words.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testChainCollapseInsignificantBrs() throws IOException {
    final String html = "<p><br/><br>Some<br>text<br/><br><br></p>";
    assertEquals("<p>Some<br />text</p>", serializer.getAsString(cleaner.clean(html)));
}
/**
 * Checks, with the HTML 4 setting, that empty elements sitting between
 * {@code <br>}s and the paragraph edge do not prevent the unneeded
 * {@code <br>}s from being eliminated.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testCollapseInsignificantBrWithEmptyElementsHTML4() throws IOException {
    properties.setHtmlVersion(HtmlCleaner.HTML_4);
    // Registered after the version switch, using the tag info provider the properties report at this point.
    properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));

    final String[] inputs = {
        "<p><span>&nbsp;</span><br/>Some text</p>",
        "<p>Some text<br><span></span><BR/><u><big></big></u><BR/></p>",
        "<p>Some text<br><span></span><BR/><u><big></big></u><BR/><u></u></p>"
    };
    for (String html : inputs) {
        assertEquals("<p>Some text</p>", serializer.getAsString(cleaner.clean(html)));
    }
}
/**
 * Checks, with the HTML 5 setting, that empty elements sitting between
 * {@code <br>}s and the paragraph edge do not prevent the unneeded
 * {@code <br>}s from being eliminated.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testCollapseInsignificantBrWithEmptyElementsHTML5() throws IOException {
    properties.setHtmlVersion(HtmlCleaner.HTML_5);
    // Registered after the version switch, using the tag info provider the properties report at this point.
    properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));

    final String[] inputs = {
        "<p><span>&nbsp;</span><br/>Some text</p>",
        "<p>Some text<br><span></span><BR/><u></u><BR/></p>",
        "<p>Some text<br><span></span><BR/><u></u><BR/><u></u></p>"
    };
    for (String html : inputs) {
        assertEquals("<p>Some text</p>", serializer.getAsString(cleaner.clean(html)));
    }
}
/**
 * Checks, with the HTML 4 setting, that {@code <br>}s nested inside
 * formatting elements are eliminated and that the elements left empty as a
 * result are collapsed too.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testInsureMeaninglessBrsStillCollapseEmptyElementsHTML4() throws IOException {
    properties.setHtmlVersion(HtmlCleaner.HTML_4);
    properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));

    final String html = "<p><u><br/></u>Some text<br><span><BR/><u><big><BR/></big></u></p></span>";
    assertEquals("<p>Some text</p>", serializer.getAsString(cleaner.clean(html)));
}
/**
 * Checks, with the HTML 5 setting, that {@code <br>}s nested inside
 * formatting elements are eliminated and that the elements left empty as a
 * result are collapsed too.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testInsureMeaninglessBrsStillCollapseEmptyElementsHTML5() throws IOException {
    properties.setHtmlVersion(HtmlCleaner.HTML_5);
    properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));

    final String html = "<p><u><br/></u>Some text<br><span><BR/><u><BR/></u></p></span>";
    assertEquals("<p>Some text</p>", serializer.getAsString(cleaner.clean(html)));
}
/**
 * Checks that empty elements carrying an {@code id} attribute are kept
 * (such elements can be referred to from JavaScript), while an empty
 * element without an id is removed. The second input spells the attribute
 * name in mixed and upper case and is expected to give the same output.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testCollapseOnlyFormattingElementsWithNoIds() throws IOException {
    final String expected = "<b id=\"notme\"></b><span id=\"norme\"></span>";

    assertEquals(expected, serializer.getAsString(
            cleaner.clean("<b id=\"notme\"></b><span></span><span id=\"norme\"></span>")));
    assertEquals(expected, serializer.getAsString(
            cleaner.clean("<b iD=\"notme\"></b><span></span><span ID=\"norme\"></span>")));
}
/**
 * Exercises collapsing on table markup: a table with only empty rows and
 * cells disappears together with its paragraph, the {@code DONT_COLLAPSE}
 * fixture serializes to {@code DONT_COLLAPSE_OUTPUT}, and in a mixed table
 * only the rows without real content are removed.
 *
 * @throws IOException declared for the cleaner/serializer calls
 */
public void testCollapseAggressively() throws IOException {
    properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));

    // Nothing but empty rows/cells: everything goes.
    assertEquals("", serializer.getAsString(
            cleaner.clean("<p><table><tr></tr><tr><td></td></tr></table></p>")));

    // Fixture pair defined elsewhere in this class.
    assertEquals(DONT_COLLAPSE_OUTPUT, serializer.getAsString(cleaner.clean(DONT_COLLAPSE)));

    // Mixed table: the id-bearing <p>, the row with text and the protected row are expected to survive.
    final String mixedInput =
            "<p id=\"notme\"></p><table><tr></tr><tr><td>Nor me</td></tr><tr><td></td></tr><tr> </tr>"
            + "<tr>&nbsp;\n</tr>" + CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR + "</table>";
    final String mixedExpected =
            "<p id=\"notme\"></p><table><tbody><tr><td>Nor me</td></tr>"
            + CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR + "</tbody></table>";
    assertEquals(mixedExpected, serializer.getAsString(cleaner.clean(mixedInput)));
}
}
@@ -0,0 +1,34 @@
package org.htmlcleaner;
import junit.framework.TestCase;
import java.io.ByteArrayInputStream;
/**
 * Testing HtmlCleaner constructors.
 *
 * <p>All cleaners in this test are built from one shared
 * {@link CleanerProperties} instance.
 */
public class ConstructorTest extends TestCase {

    /**
     * Builds several cleaners from the same properties object and checks
     * that the {@code omitComments} setting is honoured, including when the
     * flag is flipped after the cleaner has been constructed. Finally feeds
     * a byte stream containing two control characters and checks the
     * compact serialization contains {@code "FIRST SECOND"}.
     *
     * @throws Exception if cleaning or serialization fails
     */
    public void testPropertiesConstructor() throws Exception {
        CleanerProperties props = new CleanerProperties();
        props.setOmitComments(true);

        HtmlCleaner cleaner1 = new HtmlCleaner(props);
        TagNode node1 = cleaner1.clean("<a href=index.htm><b><!--COMMENT 1--><b>text text<body>");
        String xml1 = new SimpleXmlSerializer(props).getAsString(node1);
        assertTrue(xml1.indexOf("<!--COMMENT 1-->") < 0);

        HtmlCleaner cleaner2 = new HtmlCleaner(props);
        TagNode node2 = cleaner2.clean("<span href=index1.htm><b><!--COMMENT 2--><x>DDDD text<body>");
        String xml2 = new SimpleXmlSerializer(props).getAsString(node2);
        assertTrue(xml2.indexOf("<!--COMMENT 2-->") < 0);

        // The flag is changed only after cleaner3 exists; the comment is
        // nevertheless expected in the output.
        HtmlCleaner cleaner3 = new HtmlCleaner(props);
        props.setOmitComments(false);
        TagNode node3 = cleaner3.clean("<a href=index3.htm><b><!--COMMENT 3--><x>EEEEEEE text<body>");
        String xml3 = new SimpleXmlSerializer(props).getAsString(node3);
        assertTrue(xml3.indexOf("<!--COMMENT 3-->") > 0);

        // Encode with the same charset the cleaner is told to decode with,
        // instead of relying on the platform default charset.
        byte[] rawBytes = ("FIRST" + (char) 0x2 + (char) 0x3 + "SECOND").getBytes("ASCII");
        TagNode node4 = cleaner3.clean(new ByteArrayInputStream(rawBytes), "ASCII");
        String compact = new CompactXmlSerializer(props).getAsString(node4);
        assertTrue(compact.indexOf("FIRST SECOND") >= 0);
    }
}
@@ -0,0 +1,453 @@
/* Copyright (c) 2006-2013, HtmlCleaner project team (Vladimir Nikic, Scott Wilson, Pat Moore)
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
You can contact Vladimir Nikic by sending e-mail to
nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
subject line.
*/
package org.htmlcleaner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import org.junit.Test;
import org.w3c.dom.Document;
/**
 * Tests for DOCTYPE handling: how the cleaner splits a declaration into its
 * parts, which {@link DoctypeToken} type it assigns, whether the declaration
 * is reported as valid, and how the declaration is written back out.
 *
 * <p>The {@code cleaner} and {@code serializer} fields used throughout are
 * not declared in this class; presumably they are provided by
 * {@link AbstractHtmlCleanerTest} (confirm there).
 */
public class DocTypesTest extends AbstractHtmlCleanerTest {

    /**
     * Cleans {@code test12.html} with XML and DOCTYPE declarations retained
     * and checks the DOCTYPE exposed by the resulting DOM document.
     */
    @Test
    public void DocTypeUsingDom() throws IOException, ParserConfigurationException {
        CleanerProperties cleanerProperties = new CleanerProperties();
        cleanerProperties.setOmitXmlDeclaration(false);
        cleanerProperties.setOmitDoctypeDeclaration(false);
        cleanerProperties.setIgnoreQuestAndExclam(false);
        cleaner = new HtmlCleaner(cleanerProperties);
        DomSerializer domSerializer = new DomSerializer(cleaner.getProperties());
        String initial = readFile("src/test/resources/test12.html");
        TagNode cleaned = cleaner.clean(initial);
        Document doc = domSerializer.createDOM(cleaned);
        assertEquals("html", doc.getDoctype().getName());
        assertEquals("-//W3C//DTD XHTML 1.0 Strict//EN", doc.getDoctype().getPublicId());
        assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", doc.getDoctype().getSystemId());
    }

    /**
     * Reads the given file as UTF-8 text.
     *
     * <p>NOTE(review): an earlier TODO asked for this method to be removed
     * once the class extends {@code AbstractHtmlCleanerTest}; the class now
     * does, so this can go if the base class offers an equivalent helper
     * (confirm before deleting).
     *
     * @param filename path of the file to read
     * @return the file content
     * @throws IOException if the file cannot be read
     */
    protected String readFile(String filename) throws IOException {
        File file = new File(filename);
        CharSequence content = Utils.readUrl(file.toURI().toURL(), "UTF-8");
        return content.toString();
    }

    /**
     * A bare {@code <!DOCTYPE>} has no parts, is of unknown type, is invalid,
     * and is written back unchanged by the simple HTML serializer.
     */
    @Test
    public void none() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE><html><body></body></html>");
        assertEquals(null, cleaned.getDocType().getPart1());
        assertEquals(null, cleaned.getDocType().getPart2());
        assertEquals("", cleaned.getDocType().getPublicId());
        assertEquals("", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
        serializer = new SimpleHtmlSerializer(cleaner.getProperties());
        String out = serializer.getAsString(cleaned);
        // Expected value first, so a failure message reports the right sides.
        assertEquals("<!DOCTYPE>\n<html><head></head><body></body></html>", out);
    }

    //
    // Check all the valid doctypes
    //

    @Test
    public void html_5() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
        assertEquals("html", cleaned.getDocType().getPart1());
        assertEquals(null, cleaned.getDocType().getPart2());
        assertEquals("", cleaned.getDocType().getPublicId());
        assertEquals("", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML5, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void html_5_upper() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals(null, cleaned.getDocType().getPart2());
        assertEquals("", cleaned.getDocType().getPublicId());
        assertEquals("", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML5, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void html_5_legacy() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM \"about:legacy-compat\"><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals("SYSTEM", cleaned.getDocType().getPart2());
        // With a SYSTEM-only declaration the identifier is reported by getPublicId().
        assertEquals("about:legacy-compat", cleaned.getDocType().getPublicId());
        assertEquals("", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML5_LEGACY_TOOL_COMPATIBLE, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    /** Same as {@link #html_5_legacy()} but with a single-quoted identifier. */
    @Test
    public void html_5_legacy_alternate() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM 'about:legacy-compat'><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals("SYSTEM", cleaned.getDocType().getPart2());
        assertEquals("about:legacy-compat", cleaned.getDocType().getPublicId());
        assertEquals("", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML5_LEGACY_TOOL_COMPATIBLE, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_0() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD HTML 4.0//EN", cleaned.getDocType().getPublicId());
        assertEquals("", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_0_strict() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\"><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD HTML 4.0//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/REC-html40/strict.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_01_strict_identifierOnly() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
        assertEquals("", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    /** Declaration that uses both the PUBLIC and SYSTEM keywords. */
    @Test
    public void html_4_01_strict_mixed() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\" SYSTEM \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
        assertEquals("html", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/html4/strict.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_01_strict() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/html4/strict.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_01_transitional() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD HTML 4.01 Transitional//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/html4/loose.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML4_01_TRANSITIONAL, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_01_frameset() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\"><html><body></body></html>");
        assertEquals("HTML", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD HTML 4.01 Frameset//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/html4/frameset.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.HTML4_01_FRAMESET, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_strict() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
        assertEquals("html", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD XHTML 1.0 Strict//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.XHTML1_0_STRICT, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_transitional() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html><body></body></html>");
        assertEquals("html", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD XHTML 1.0 Transitional//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.XHTML1_0_TRANSITIONAL, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_frameset() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Frameset//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd\"><html><body></body></html>");
        assertEquals("html", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD XHTML 1.0 Frameset//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.XHTML1_0_FRAMESET, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_1() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><html><body></body></html>");
        assertEquals("html", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD XHTML 1.1//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_1_basic() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
        assertEquals("html", cleaned.getDocType().getPart1());
        assertEquals("PUBLIC", cleaned.getDocType().getPart2());
        assertEquals("-//W3C//DTD XHTML Basic 1.1//EN", cleaned.getDocType().getPublicId());
        assertEquals("http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd", cleaned.getDocType().getSystemId());
        assertEquals(DoctypeToken.XHTML1_1_BASIC, cleaned.getDocType().getType());
        assertTrue(cleaned.getDocType().isValid());
    }

    //
    // Now some invalid ones
    //

    /** Same input as {@link #none()}, checking only type and validity. */
    @Test
    public void empty() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE><html><body></body></html>");
        assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void not_html() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE banana><html><body></body></html>");
        assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    /** A public identifier introduced with SYSTEM instead of PUBLIC. */
    @Test
    public void html_4_0_wrong_id_type() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM \"-//W3C//DTD HTML 4.0//EN\"><html><body></body></html>");
        assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    // The following cases pair a recognised public identifier with a system
    // identifier that belongs to a different DTD, or omit the system
    // identifier: the type is still recognised but the doctype is invalid.

    @Test
    public void html_4_0_wrong_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
        assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_01_wrong_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
        assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_01_transitional_bad_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
        assertEquals(DoctypeToken.HTML4_01_TRANSITIONAL, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void html_4_01_frameset_bad_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\"><html><body></body></html>");
        assertEquals(DoctypeToken.HTML4_01_FRAMESET, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_0_with_wrong_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
        assertEquals(DoctypeToken.XHTML1_0_STRICT, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_0_transitional_with_wrong_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"><html><body></body></html>");
        assertEquals(DoctypeToken.XHTML1_0_TRANSITIONAL, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_0_frameset_with_wrong_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Frameset//EN\"><html><body></body></html>");
        assertEquals(DoctypeToken.XHTML1_0_FRAMESET, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_1_with_wrong_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
        assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    @Test
    public void xhtml_1_1_with_no_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"><html><body></body></html>");
        assertFalse(cleaned.getDocType().isValid());
        assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
    }

    @Test
    public void xhtml_1_1_basic_with_no_id() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
        assertEquals(DoctypeToken.XHTML1_1_BASIC, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    /** An unrecognised keyword where PUBLIC or SYSTEM is expected. */
    @Test
    public void weird_token() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html SILLY \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
        assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
        assertFalse(cleaned.getDocType().isValid());
    }

    //
    // Serializer
    //

    @Test
    public void html_4_01_serialize() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
        String output = serializer.getAsString(cleaned);
        assertTrue(output.startsWith("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"));
    }

    @Test
    public void html_4_01_domserialize() throws IOException, ParserConfigurationException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
        DomSerializer domSerializer = new DomSerializer(cleaner.getProperties());
        Document doc = domSerializer.createDOM(cleaned);
        assertEquals("html", doc.getDocumentElement().getNodeName());
        assertEquals("HTML", doc.getDoctype().getName());
        assertEquals("-//W3C//DTD HTML 4.01//EN", doc.getDoctype().getPublicId());
        assertEquals("http://www.w3.org/TR/html4/strict.dtd", doc.getDoctype().getSystemId());
    }

    /** Lower-case {@code html} in an HTML 4.01 declaration is written back as {@code HTML}. */
    @Test
    public void html_4_01_case_correct() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
        String output = serializer.getAsString(cleaned);
        assertTrue(output.startsWith("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"));
    }

    @Test
    public void xhtml_1_1_serialize() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
        String output = serializer.getAsString(cleaned);
        assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\">"));
    }

    @Test
    public void xhtml_1_0_strict_serialize() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
        String output = serializer.getAsString(cleaned);
        assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"));
    }

    /** Upper-case {@code HTML} in an XHTML declaration is written back as {@code html}. */
    @Test
    public void xhtml_1_0_strict_serialize_case_correct() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
        String output = serializer.getAsString(cleaned);
        assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"));
    }

    @Test
    public void html5_serialize() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
        String output = serializer.getAsString(cleaned);
        assertTrue(output.startsWith("<!DOCTYPE html>"));
    }

    /** Upper-case {@code HTML} in an HTML5 declaration is written back as {@code html}. */
    @Test
    public void html5_serialize_case_correct() throws IOException {
        TagNode cleaned = cleaner.clean("<!DOCTYPE HTML><html><body></body></html>");
        String output = serializer.getAsString(cleaned);
        assertTrue(output.startsWith("<!DOCTYPE html>"));
    }

    //
    // Misc
    //

    /** {@code toString()} of the doctype token equals its {@code getContent()}. */
    @Test
    public void checkToString() {
        TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
        assertEquals(cleaned.getDocType().getContent(), cleaned.getDocType().toString());
    }
}
@@ -0,0 +1,380 @@
/* Copyright (c) 2006-2019, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import org.jdom2.input.DOMBuilder;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.w3c.dom.Document;
public class DomSerializerTest extends AbstractHtmlCleanerTest {
/**
 * A pseudo-tag that is not valid markup ({@code <^-^>}) ends up as text
 * inside the paragraph when the DOM is built with a serializer created from
 * properties alone.
 */
@Test
public void removeInvalidTags3() throws Exception {
    final DomSerializer domSerializer = new DomSerializer(new CleanerProperties());
    final TagNode root = new HtmlCleaner().clean("<p><^-^></p>");
    final Document dom = domSerializer.createDOM(root);

    final String firstChildText =
            dom.getElementsByTagName("p").item(0).getChildNodes().item(0).getTextContent();
    assertEquals("&lt;^-^&gt;", firstChildText);
}
/**
 * An attribute whose name and value contain a non-ASCII letter, written
 * literally in the source, survives DOM creation when invalid attribute
 * names are disallowed and the serializer's XML version is set to 1.1.
 */
@Test
public void attributeCharacters() throws Exception {
    final CleanerProperties props = new CleanerProperties();
    props.setAllowInvalidAttributeNames(false);
    final DomSerializer xml11Serializer = new DomSerializer(props);
    xml11Serializer.setXmlVersion("1.1");

    final TagNode root = new HtmlCleaner().clean("<p dispariție='dispariție.'></p>");
    final Document dom = xml11Serializer.createDOM(root);

    final String firstAttributeValue =
            dom.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent();
    assertEquals("dispariție.", firstAttributeValue);
}
/**
 * Same scenario as a literal non-ASCII attribute, but the input markup
 * spells the letter with a {@code \u021b} escape; the expected attribute
 * value is still the literal form.
 */
@Test
public void attributeCharactersEncoded() throws Exception {
    final CleanerProperties props = new CleanerProperties();
    props.setAllowInvalidAttributeNames(false);
    final DomSerializer xml11Serializer = new DomSerializer(props);
    xml11Serializer.setXmlVersion("1.1");

    final TagNode root = new HtmlCleaner().clean("<p dispari\u021bie='dispari\u021bie.'></p>");
    final Document dom = xml11Serializer.createDOM(root);

    final String firstAttributeValue =
            dom.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent();
    assertEquals("dispariție.", firstAttributeValue);
}
/**
 * With invalid attribute names disallowed, an attribute named
 * {@code t%st} still yields a first attribute on the paragraph whose value
 * is the original text.
 */
@Test
public void attributeCharacters2() throws Exception {
    final CleanerProperties props = new CleanerProperties();
    props.setAllowInvalidAttributeNames(false);

    final TagNode root = new HtmlCleaner().clean("<p t%st='dispariție.'></p>");
    final Document dom = new DomSerializer(props).createDOM(root);

    final String firstAttributeValue =
            dom.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent();
    assertEquals("dispariție.", firstAttributeValue);
}
/**
 * Serializes a {@code div} whose attribute value contains the entities for quote,
 * ampersand, greater-than and less-than, with the HTML envelope and the XML
 * declaration omitted, and expects the output to equal the input. See bug #203.
 */
@Test
public void parse2() throws Exception {
    final String markup = "<div foo=\"aaa&quot;bbb&amp;ccc&gt;ddd&lt;eee\">content</div>";

    final CleanerProperties props = new CleanerProperties();
    // NOTE(review): the cleaned tree is never read; the serializer below is given the raw markup.
    new HtmlCleaner().clean(markup);
    props.setOmitHtmlEnvelope(true);
    props.setOmitXmlDeclaration(true);

    final String serialized = new SimpleXmlSerializer(props).getAsString(markup);
    assertEquals("<div foo=\"aaa&quot;bbb&amp;ccc&gt;ddd&lt;eee\">content</div>", serialized);
}
/**
 * Checks an {@code img} whose {@code src} URL contains {@code &amp;}, preceded by an
 * XML declaration, in two ways: the attribute value read from the DOM, and the string
 * produced by {@link SimpleXmlSerializer} with the HTML envelope and XML declaration
 * omitted. See bug #212.
 */
@Test
public void parse() throws Exception {
    final String markup = "<?xml version = \"1.0\"?><img src=\"http://xwiki.org?a=&amp;b\"/>";
    final CleanerProperties props = new CleanerProperties();
    final TagNode cleaned = new HtmlCleaner().clean(markup);

    // First check: the attribute as exposed through the DOM.
    final Document dom = new DomSerializer(props, true).createDOM(cleaned);
    final org.w3c.dom.Node image = dom.getElementsByTagName("img").item(0);
    final String srcValue = image.getAttributes().getNamedItem("src").getTextContent();
    assertEquals("http://xwiki.org?a=&amp;b", srcValue);

    // Second check: the serialized string, without envelope or declaration.
    props.setOmitHtmlEnvelope(true);
    props.setOmitXmlDeclaration(true);
    final String serialized = new SimpleXmlSerializer(props).getAsString(markup);
    assertEquals("<img src=\"http://xwiki.org?a=&amp;b\" />", serialized);
}
/**
 * Cleans a paragraph containing the pseudo-tag {@code <^-^>} and checks, using a
 * serializer constructed with its boolean argument set to {@code false}, that the
 * first child of the first {@code p} element has the text content {@code &lt;^-^&gt;}.
 */
@Test
public void removeInvalidTags() throws Exception {
    final TagNode cleaned = new HtmlCleaner().clean("<p><^-^></p>");
    final DomSerializer serializer = new DomSerializer(new CleanerProperties(), false);
    final Document dom = serializer.createDOM(cleaned);

    final org.w3c.dom.Node paragraph = dom.getElementsByTagName("p").item(0);
    final String firstChildText = paragraph.getChildNodes().item(0).getTextContent();
    assertEquals("&lt;^-^&gt;", firstChildText);
}
/**
 * Cleans a paragraph containing {@code <1o/>} (a tag name starting with a digit) and
 * checks that the first child of the first {@code p} element has the text content
 * {@code &lt;1o/&gt;}.
 */
@Test
public void removeInvalidTags2() throws Exception {
    final TagNode cleaned = new HtmlCleaner().clean("<p><1o/></p>");
    final DomSerializer serializer = new DomSerializer(new CleanerProperties(), false);
    final Document dom = serializer.createDOM(cleaned);

    final org.w3c.dom.Node paragraph = dom.getElementsByTagName("p").item(0);
    final String firstChildText = paragraph.getChildNodes().item(0).getTextContent();
    assertEquals("&lt;1o/&gt;", firstChildText);
}
/**
 * Cleans a {@code meta} tag in which a no-break space (U+00A0) separates the tag name
 * from its first attribute, and checks that the DOM contains a {@code meta} element
 * whose {@code property} attribute has the value {@code test}.
 */
@Test
public void detectUnicodeSpaces() throws Exception {
    final TagNode cleaned = new HtmlCleaner().clean("<meta\u00A0property=\"test\" content=\"value\">");
    final DomSerializer serializer = new DomSerializer(new CleanerProperties(), false);
    final Document dom = serializer.createDOM(cleaned);

    final org.w3c.dom.Node meta = dom.getElementsByTagName("meta").item(0);
    final String propertyValue = meta.getAttributes().getNamedItem("property").getTextContent();
    assertEquals("test", propertyValue);
}
/**
 * Builds a DOM from a small document whose paragraph holds a word with non-ASCII
 * letters and compares the result of {@code documentToString} with the expected XML,
 * in which the word appears unchanged.
 */
@Test
public void preserveUnicodeTest() throws Exception {
    final String word = "hemförsäkring";
    final String markup = "<html>" + "<body>" + "<p>" + word + "</p>" + "</body>" + "</html>";

    final StringBuilder expected = new StringBuilder();
    expected.append("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n");
    expected.append("<html>\n");
    expected.append(" <head/>\n");
    expected.append(" <body>\n");
    expected.append(" <p>").append(word).append("</p>\n");
    expected.append(" </body>\n");
    expected.append("</html>\n");

    final TagNode cleaned = new HtmlCleaner().clean(markup);
    final DomSerializer serializer = new DomSerializer(new CleanerProperties(), false);
    final Document dom = serializer.createDOM(cleaned);

    assertEquals(expected.toString(), documentToString(dom));
}
/**
 * Checks how markup embedded in an attribute value ({@code <img srcset="<p%20">}) is
 * handled with {@code allowHtmlInsideAttributes} switched on and off, in both the
 * HTML serialization and the DOM. See bug #215.
 *
 * @throws ParserConfigurationException declared for the DOM creation step
 */
@Test
public void invalidXMLElementName() throws ParserConfigurationException{
    final String HTML = "<img srcset=\"<p%20\">";
    final CleanerProperties cleanerProperties = new CleanerProperties();
    //
    // When we set allow to true, then we parse the attribute value as text
    //
    cleanerProperties.setAllowHtmlInsideAttributes(true);
    TagNode tagNode = new HtmlCleaner(cleanerProperties).clean(HTML);
    // Expected value goes first so that a failure reports expected/actual the right way round.
    assertEquals("<p%20", tagNode.getChildTags()[1].getChildTags()[0].getAttributeByName("srcset"));
    //
    // When we set allow to false, then we identify tags in attribute as new tags, and break
    // into a new tag
    //
    cleanerProperties.setAllowHtmlInsideAttributes(false);
    tagNode = new HtmlCleaner(cleanerProperties).clean(HTML);
    //
    // Not an issue for HTML, which accepts pretty much anything in a tag name
    //
    cleanerProperties.setOmitXmlDeclaration(true);
    String output = new SimpleHtmlSerializer(cleanerProperties).getAsString(tagNode);
    assertEquals("<html><head></head><body><img srcset=\"\" /><p%20></p%20></body></html>", output);
    //
    // But for XML DOM, we must follow the rules for building valid names, which means
    // getting rid of the % sign
    //
    final Document doc = new DomSerializer(cleanerProperties, false).createDOM(tagNode);
    assertEquals(1, doc.getDocumentElement().getElementsByTagName("p20").getLength());
}
/**
 * Creates an empty document from a serializer constructed with the flag sequence
 * {@code (true, true, false)} and checks that the document reports strict error
 * checking as disabled.
 *
 * @throws ParserConfigurationException declared for the document creation step
 */
@Test
public void errorChecking() throws ParserConfigurationException {
    final TagNode paragraph = cleaner.clean("<p>");
    final DomSerializer serializer = new DomSerializer(cleaner.getProperties(), true, true, false);
    final Document created = serializer.createDocument(paragraph);
    final boolean strict = created.getStrictErrorChecking();
    assertFalse(strict);
}
/**
 * Builds a DOM from {@code test23.html} and checks that the first child of the
 * document's first node exists and is named {@code head}. Currently ignored.
 * See issue 108.
 *
 * @throws Exception if reading the resource or building the DOM fails
 */
@Test
@Ignore
public void html5doctype() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(true);
    props.setOmitCdataOutsideScriptAndStyle(true);

    final String markup = readFile("src/test/resources/test23.html");
    final TagNode cleaned = cleaner.clean(markup);
    final Document dom = new DomSerializer(cleaner.getProperties()).createDOM(cleaned);

    final org.w3c.dom.Node firstGrandchild = dom.getChildNodes().item(0).getChildNodes().item(0);
    assertNotNull(firstGrandchild);
    assertEquals("head", firstGrandchild.getNodeName());
}
/**
 * Builds a DOM from {@code test29.html} and checks the {@code xmlns} and {@code id}
 * attributes on the document's first node, then looks the element up by id to confirm
 * the id is registered as a real ID attribute. See issue 127.
 *
 * @throws Exception if reading the resource or building the DOM fails
 */
@Test
public void rootNodeAttributes() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(true);
    props.setOmitCdataOutsideScriptAndStyle(true);

    final String markup = readFile("src/test/resources/test29.html");
    final TagNode cleaned = cleaner.clean(markup);
    final Document dom = new DomSerializer(cleaner.getProperties()).createDOM(cleaned);

    final org.w3c.dom.Node root = dom.getChildNodes().item(0);
    assertNotNull(root.getChildNodes().item(0));

    final org.w3c.dom.NamedNodeMap rootAttributes = root.getAttributes();
    assertEquals("http://unknown.namespace.com", rootAttributes.getNamedItem("xmlns").getNodeValue());
    assertEquals("27", rootAttributes.getNamedItem("id").getNodeValue());

    // getElementById only finds the element when "id" is a real ID attribute,
    // not just a regular attribute that happens to be called "id".
    assertEquals("http://unknown.namespace.com", dom.getElementById("27").getAttribute("xmlns"));
}
/**
 * Serializes a script containing {@code &gt;} with CDATA enabled for script and style
 * and the serializer's third constructor argument set to {@code true}, converts the
 * DOM to JDOM, and checks that the raw output contains {@code this > that}.
 */
@Test
public void cdata() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(true);
    props.setOmitCdataOutsideScriptAndStyle(true);

    final TagNode cleaned = cleaner.clean("<script> this &gt; that </script>");
    final DomSerializer serializer = new DomSerializer(props, props.isAdvancedXmlEscape(), true);
    final Document dom = serializer.createDOM(cleaned);

    final org.jdom2.Document jdom = new DOMBuilder().build(dom);
    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    Assert.assertTrue(xml.contains("this > that"));
}
/**
 * Serializes a script containing {@code &gt;} with CDATA enabled for script and style
 * and the serializer's third constructor argument set to {@code false}, converts the
 * DOM to JDOM, and checks that the raw output contains {@code this &gt; that}.
 */
@Test
public void cdata2() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(true);
    props.setOmitCdataOutsideScriptAndStyle(true);

    final TagNode cleaned = cleaner.clean("<script> this &gt; that </script>");
    final DomSerializer serializer = new DomSerializer(props, props.isAdvancedXmlEscape(), false);
    final Document dom = serializer.createDOM(cleaned);

    final org.jdom2.Document jdom = new DOMBuilder().build(dom);
    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    Assert.assertTrue(xml.contains("this &gt; that"));
}
/**
 * With special-entity translation and advanced XML escaping both enabled, a literal
 * pound sign, {@code &pound;} and {@code &#163;} are all expected to come out of the
 * DOM as the pound character.
 */
@Test
public void escaping() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setTranslateSpecialEntities(true);
    props.setAdvancedXmlEscape(true);

    final TagNode cleaned = cleaner.clean("<div>£, &pound; and &#163;</div>");
    final Document dom = new DomSerializer(props, true).createDOM(cleaned);

    final org.w3c.dom.Node div = dom.getElementsByTagName("div").item(0);
    Assert.assertEquals("£, £ and £", div.getTextContent());
}
/**
 * With special-entity translation disabled and the serializer's boolean argument set
 * to {@code false}, the named and numeric pound entities are expected to appear in the
 * DOM text exactly as written in the input.
 */
@Test
public void escaping_2() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setTranslateSpecialEntities(false);

    final TagNode cleaned = cleaner.clean("<div>£, &pound; and &#163;</div>");
    final Document dom = new DomSerializer(props, false).createDOM(cleaned);

    final org.w3c.dom.Node div = dom.getElementsByTagName("div").item(0);
    Assert.assertEquals("£, &pound; and &#163;", div.getTextContent());
}
/**
 * With special-entity translation disabled and the serializer's boolean argument set
 * to {@code true}, the numeric entity is expected to become the pound character while
 * the named entity {@code &pound;} stays as written.
 */
@Test
public void escaping_3() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setTranslateSpecialEntities(false);

    final TagNode cleaned = cleaner.clean("<div>£, &pound; and &#163;</div>");
    final Document dom = new DomSerializer(props, true).createDOM(cleaned);

    final org.w3c.dom.Node div = dom.getElementsByTagName("div").item(0);
    Assert.assertEquals("£, &pound; and £", div.getTextContent());
}
/**
 * With Unicode character recognition disabled and the serializer's boolean argument
 * set to {@code true}, both the named and the numeric pound entity are expected to
 * appear in the DOM text as {@code &pound;}.
 */
@Test
public void escaping_4() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setRecognizeUnicodeChars(false);

    final TagNode cleaned = cleaner.clean("<div>£, &pound; and &#163;</div>");
    final Document dom = new DomSerializer(props, true).createDOM(cleaned);

    final org.w3c.dom.Node div = dom.getElementsByTagName("div").item(0);
    Assert.assertEquals("£, &pound; and &pound;", div.getTextContent());
}
/**
 * With Unicode character recognition disabled, the four reserved characters (quote,
 * less-than, greater-than, ampersand) inside a {@code div} are expected to appear in
 * the DOM text as their named entities.
 */
@Test
public void escapingReservedCharactersTest() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setRecognizeUnicodeChars(false);

    final TagNode cleaned = cleaner.clean("<div>\" < > &</div>");
    final Document dom = new DomSerializer(props, true).createDOM(cleaned);

    final org.w3c.dom.Node div = dom.getElementsByTagName("div").item(0);
    Assert.assertEquals("&quot; &lt; &gt; &amp;", div.getTextContent());
}
/**
 * Characters inside a comment must not be escaped: the first child node of the
 * {@code div} (the comment) is expected to return the quote, apostrophe, less-than,
 * greater-than and ampersand characters exactly as written.
 */
@Test
public void escapingCommentsTest() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setRecognizeUnicodeChars(false);

    final TagNode cleaned = cleaner.clean("<div><!--\" \' < > &--></div>");
    final Document dom = new DomSerializer(props, true).createDOM(cleaned);

    final org.w3c.dom.Node div = dom.getElementsByTagName("div").item(0);
    final String commentText = div.getChildNodes().item(0).getTextContent();
    Assert.assertEquals("\" \' < > &", commentText);
}
/**
 * Cleans a {@code div} holding several numeric character references and one named
 * entity, converts the resulting DOM to JDOM, and checks that the raw output contains
 * the corresponding characters.
 */
@Test
public void ncr() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setOmitComments(true);
    props.setNamespacesAware(false);
    props.setUseCdataForScriptAndStyle(true);
    props.setTranslateSpecialEntities(true);

    final TagNode cleaned = cleaner.clean("<div> &#8217; &#1078; &#253; &#247; &divide; </div>");
    final DomSerializer serializer = new DomSerializer(props, props.isAdvancedXmlEscape(), false);
    final Document dom = serializer.createDOM(cleaned);

    final org.jdom2.Document jdom = new DOMBuilder().build(dom);
    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    Assert.assertTrue(xml.contains(" ж ý ÷ ÷"));
}
}
@@ -0,0 +1,83 @@
/* Copyright (c) 2006-2014, the HtmlCleaner project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import junit.framework.TestCase;
/**
 * Checks the text returned for the {@code body} element when the cleaner is configured
 * with {@code deserializeEntities} enabled, for named, decimal, hexadecimal and
 * unterminated entities and for entity text inside a CDATA section.
 */
public class EntityDeserializationTest extends TestCase {

    private HtmlCleaner cleaner;

    /** Creates a cleaner whose properties have entity deserialization switched on. */
    @Override
    public void setUp() {
        final CleanerProperties properties = new CleanerProperties();
        properties.setDeserializeEntities(true);
        cleaner = new HtmlCleaner(properties);
    }

    /** Drops the cleaner reference after each test. */
    @Override
    public void tearDown() {
        cleaner = null;
    }

    /**
     * Wraps {@code input} in an html/body envelope, cleans it and compares the text of
     * the body element with {@code output}.
     *
     * @param input  markup placed inside the body
     * @param output expected text of the body element
     */
    private void doTest(String input, String output) {
        final String document = "<html><body>" + input + "</body></html>";
        final TagNode body = cleaner.clean(document).findElementByName("body", true);
        final String bodyText = body.getText().toString();
        assertEquals(output, bodyText);
    }

    /** A named entity is turned into its character. */
    public void testNamedEntity() {
        doTest("&quot;", "\"");
    }

    /** A decimal character reference is turned into its character. */
    public void testDecimalEntity() {
        doTest("&#160;", "\u00a0");
    }

    /** A hexadecimal character reference is turned into its character. */
    public void testHexadecimalEntity() {
        doTest("&#xa0;", "\u00a0");
    }

    /** An entity missing its semicolon is left as text; the entity after it is converted. */
    public void testAbortedEntity() {
        doTest("&amp&quot;", "&amp\"");
    }

    /** Entity text inside a CDATA section within a script is left untouched. */
    public void testCData() {
        final String script = "<script>" + CData.BEGIN_CDATA + "&amp;" + CData.END_CDATA + "</script>";
        doTest(script, "&amp;");
    }
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,236 @@
/* Copyright (c) 2006-2013, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import org.jdom2.Document;
import org.jdom2.Namespace;
import org.jdom2.output.Format;
import org.jdom2.output.XMLOutputter;
import org.junit.Test;
public class JDomSerializerTest extends AbstractHtmlCleanerTest {
/**
 * Checks that valid element names are created in the JDOM tree: for an {@code img}
 * whose {@code srcset} value contains {@code <p%20}, the expected output has an empty
 * {@code srcset} followed by a {@code p20} element.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void elementNames() throws IOException {
    final CleanerProperties properties = new CleanerProperties();
    properties.setAddNewlineToHeadAndBody(false);

    final TagNode cleaned = new HtmlCleaner(properties).clean("<img srcset=\"<p%20\">");
    final Document jdom = new JDomSerializer(properties, true).createJDom(cleaned);

    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    final String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head /><body><img srcset=\"\" /><p20 /></body></html>\n";
    assertEquals(wanted, xml);
}
/**
 * Tests that a bare CDATA section inside a script is output by JDOM wrapped in
 * comment markers, with the script body on its own line.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void safeCData1() throws IOException {
    final CleanerProperties properties = new CleanerProperties();
    properties.setOmitCdataOutsideScriptAndStyle(true);
    properties.setAddNewlineToHeadAndBody(false);

    final String markup = "<head><script type=\"text/javascript\"><![CDATA[alert(\"Hello World\")]]></script></head>";
    final TagNode cleaned = new HtmlCleaner(properties).clean(markup);
    final Document jdom = new JDomSerializer(properties, true).createJDom(cleaned);

    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    final String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
    assertEquals(wanted, xml);
}
/**
 * Tests that a CDATA section already guarded by line comments inside a script is
 * output by JDOM wrapped in block-comment markers.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void safeCData2() throws IOException {
    final CleanerProperties properties = new CleanerProperties();
    properties.setOmitCdataOutsideScriptAndStyle(true);
    properties.setAddNewlineToHeadAndBody(false);

    final String markup = "<head><script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script></head>";
    final TagNode cleaned = new HtmlCleaner(properties).clean(markup);
    final Document jdom = new JDomSerializer(properties, true).createJDom(cleaned);

    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    final String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
    assertEquals(wanted, xml);
}
/**
 * Tests that a CDATA section already guarded by block comments inside a script is
 * output by JDOM in the normalised form, with the script body on its own line.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void safeCData3() throws IOException {
    final CleanerProperties properties = new CleanerProperties();
    properties.setOmitCdataOutsideScriptAndStyle(true);
    properties.setAddNewlineToHeadAndBody(false);

    final String markup = "<head><script type=\"text/javascript\">/*<![CDATA[*/alert(\"Hello World\")\n/*]]>*/</script></head>";
    final TagNode cleaned = new HtmlCleaner(properties).clean(markup);
    final Document jdom = new JDomSerializer(properties, true).createJDom(cleaned);

    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    final String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
    assertEquals(wanted, xml);
}
/**
 * Tests CDATA commenting in JDOM on a more complex example: {@code test33.html} is
 * cleaned and serialized, and the output is compared with {@code test33_expected.html}.
 *
 * @throws IOException if a resource file cannot be read
 */
@Test
public void safeCData4() throws IOException {
    final String markup = readFile("src/test/resources/test33.html");
    final String wanted = readFile("src/test/resources/test33_expected.html");

    final CleanerProperties properties = new CleanerProperties();
    properties.setOmitCdataOutsideScriptAndStyle(true);
    properties.setAddNewlineToHeadAndBody(false);

    final TagNode cleaned = new HtmlCleaner(properties).clean(markup);
    final Document jdom = new JDomSerializer(properties, true).createJDom(cleaned);

    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    assertEquals(wanted, xml);
}
/**
 * Tests CDATA commenting in JDOM when the script body consists of the entities for
 * less-than and greater-than, with CDATA for script and style and entity
 * deserialization both enabled: the expected output holds the two raw characters
 * inside a commented CDATA section.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void safeCData5() throws IOException {
    final CleanerProperties properties = new CleanerProperties();
    properties.setOmitCdataOutsideScriptAndStyle(true);
    properties.setUseCdataForScriptAndStyle(true);
    properties.setDeserializeEntities(true);
    properties.setAddNewlineToHeadAndBody(false);

    final TagNode cleaned = new HtmlCleaner(properties).clean("<head><script>&lt;&gt;</script></head>");
    final Document jdom = new JDomSerializer(properties, true).createJDom(cleaned);

    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    final String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script>/*<![CDATA[*/\n<>\n/*]]>*/</script></head><body /></html>\n";
    assertEquals(wanted, xml);
}
/**
 * Tests CDATA commenting in JDOM for CSS: a bare CDATA section inside a {@code style}
 * element is expected to be output wrapped in comment markers.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void safeCData6() throws IOException {
    final CleanerProperties properties = new CleanerProperties();
    properties.setOmitCdataOutsideScriptAndStyle(true);
    properties.setUseCdataForScriptAndStyle(true);
    properties.setAddNewlineToHeadAndBody(false);

    final String markup = "<head><style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style></head>";
    final TagNode cleaned = new HtmlCleaner(properties).clean(markup);
    final Document jdom = new JDomSerializer(properties, true).createJDom(cleaned);

    final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
    final String xml = new XMLOutputter(rawUtf8).outputString(jdom);

    final String wanted = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>\n";
    assertEquals(wanted, xml);
}
/**
 * Builds a JDOM document from an empty {@code html} element with the HTML envelope
 * omitted. There is no assertion; the test passes as long as no exception is thrown.
 * See issue #95.
 */
@Test
public void testNPE() {
    final CleanerProperties properties = new CleanerProperties();
    properties.setOmitHtmlEnvelope(true);

    final TagNode cleaned = new HtmlCleaner(properties).clean("<html></html>");
    final JDomSerializer serializer = new JDomSerializer(properties, true);
    serializer.createJDom(cleaned);
}
/**
 * With CDATA enabled for script and style, the second content node of the script
 * element in {@code test22.html} is expected to be a JDOM CDATA node. See issue 106.
 *
 * @throws Exception if reading the resource or building the document fails
 */
@Test
public void CDATA() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(true);
    props.setOmitCdataOutsideScriptAndStyle(true);

    final String markup = readFile("src/test/resources/test22.html");
    final TagNode cleaned = cleaner.clean(markup);
    final Document jdom = new JDomSerializer(cleaner.getProperties()).createJDom(cleaned);

    final Object secondScriptNode =
            jdom.getRootElement().getChild("head").getChild("script").getContent().get(1);
    assertEquals("org.jdom2.CDATA", secondScriptNode.getClass().getName());
}
/**
 * With CDATA disabled for script and style, the first content node of the script
 * element in {@code test22.html} is expected to be a plain JDOM Text node.
 * See issue 106.
 *
 * @throws Exception if reading the resource or building the document fails
 */
@Test
public void noCDATA() throws Exception {
    final CleanerProperties props = cleaner.getProperties();
    props.setUseCdataForScriptAndStyle(false);
    props.setOmitCdataOutsideScriptAndStyle(true);

    final String markup = readFile("src/test/resources/test22.html");
    final TagNode cleaned = cleaner.clean(markup);
    final Document jdom = new JDomSerializer(cleaner.getProperties()).createJDom(cleaned);

    final Object firstScriptNode =
            jdom.getRootElement().getChild("head").getChild("script").getContent().get(0);
    assertEquals("org.jdom2.Text", firstScriptNode.getClass().getName());
}
/**
 * Tests that foreign markup is handled: with namespace awareness on,
 * {@code test21.html} is converted to JDOM and the {@code body}, {@code svg} and
 * {@code title} elements are looked up by namespace.
 *
 * @throws Exception if reading the resource or building the document fails
 */
@Test
public void namespaces() throws Exception {
    cleaner.getProperties().setNamespacesAware(true);

    final String markup = readFile("src/test/resources/test21.html");
    final TagNode cleaned = cleaner.clean(markup);
    final Document jdom = new JDomSerializer(cleaner.getProperties()).createJDom(cleaned);

    final Namespace xhtml = Namespace.getNamespace("http://www.w3.org/1999/xhtml");
    final Namespace svg = Namespace.getNamespace("http://www.w3.org/2000/svg");

    // Each lookup dereferences the previous result, so a wrong namespace
    // anywhere along the chain shows up as a NullPointerException.
    jdom.getRootElement().getChild("body", xhtml).getNamespaceURI();
    jdom.getRootElement().getChild("body", xhtml).getChild("svg", svg).getNamespaceURI();
    jdom.getRootElement().getChild("body", xhtml).getChild("svg", svg).getChild("title", svg);
}
}
@@ -0,0 +1,63 @@
/* Copyright (c) 2006-2017, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import java.io.IOException;
import org.junit.Test;
/**
 * Cleaning tests for documents containing MathML; each test compares a cleaned
 * resource file with its expected counterpart.
 */
public class MathMLTest extends AbstractHtmlCleanerTest {

    /**
     * Checks that inline MathML statements remain inline. See bug #193.
     *
     * @throws IOException if a resource file cannot be read
     */
    @Test
    public void mathMLNamespaces() throws IOException {
        assertCleaned(
                readFile("src/test/resources/test35.html"),
                readFile("src/test/resources/test35_expected.html"));
    }

    /**
     * Checks that MathML is properly formed. See bug #204.
     *
     * @throws IOException if a resource file cannot be read
     */
    @Test
    public void mathML() throws IOException {
        assertCleaned(
                readFile("src/test/resources/test36.html"),
                readFile("src/test/resources/test36_expected.html"));
    }
}
@@ -0,0 +1,175 @@
/* Copyright (c) 2006-2013, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import java.io.IOException;
import org.junit.Test;
public class NamespacesTest extends AbstractHtmlCleanerTest{
/**
 * Checks that the xlink namespace declaration is added automatically to the
 * {@code html} element when an {@code xlink:href} attribute is used without any
 * {@code xmlns} attribute.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void missingDeclaration() throws IOException {
    final String markup = "<p xlink:href=\"#someHeading\"/>";
    final String wanted = "<html xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n<head />\n<body><p xlink:href=\"#someHeading\"></p></body></html>";
    assertCleaned(markup, wanted);
}
/**
 * Checks that an upper-case {@code XMLNS} attribute on an unknown element is kept as
 * written in the cleaned output. See issue #135.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void xmlnsAttributeInUpperCase() throws IOException {
    final String markup = "<BANANA XMLNS=\"BANANA\"/>";
    final String wanted = "<html>\n<head />\n<body><BANANA XMLNS=\"BANANA\" /></body></html>";
    assertCleaned(markup, wanted);
}
/**
 * Checks that a prefixed element carrying its own namespace declaration is kept, for
 * input that starts with a newline and explicit {@code head} and {@code body} tags.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void xmlnsAttributeAndPrefix() throws IOException {
    final String markup = "\n<head />\n<body><xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\"/>";
    final String wanted = "<html>\n<head />\n<body>\n<xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\" /></body></html>";
    assertCleaned(markup, wanted);
}
/**
 * Checks that a prefixed element carrying its own namespace declaration is kept when
 * it is the only markup in the input.
 *
 * @throws IOException declared by the test signature
 */
@Test
public void xmlnsAttributeAndPrefix2() throws IOException {
    final String markup = "<xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\"/>";
    final String wanted = "<html>\n<head />\n<body><xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\" /></body></html>";
    assertCleaned(markup, wanted);
}
/**
 * Checks that empty {@code xmlns=""} attributes are handled: {@code test32.html} is
 * expected to clean to a link, an image and a paragraph with no namespace attributes.
 * See issue #135.
 *
 * @throws IOException if the resource file cannot be read
 */
@Test
public void emptyNamespaces() throws IOException {
    final String markup = readFile("src/test/resources/test32.html");
    assertCleaned(
            markup,
            "<html>\n<head />\n<body><a href=\"link.html\"><img /></a><p>Text</p></body></html>");
}
/**
 * Uses an RDFa example ({@code test13.html}) to check that namespace declarations are
 * retained. See issue #63.
 *
 * @throws IOException if a resource file cannot be read
 */
@Test
public void RDFa() throws IOException {
    assertCleaned(
            readFile("src/test/resources/test13.html"),
            readFile("src/test/resources/test13_expected.html"));
}
/**
 * Cleans a document that uses a namespace prefix on an element ({@code test14.html})
 * and compares it with the expected output. See issue #63.
 *
 * @throws IOException if a resource file cannot be read
 */
@Test
public void DCElement() throws IOException {
    assertCleaned(
            readFile("src/test/resources/test14.html"),
            readFile("src/test/resources/test14_expected.html"));
}
/**
 * Cleans a document that uses a namespace prefix on an attribute ({@code test15.html})
 * and compares it with the expected output. See issue #63.
 *
 * @throws IOException if a resource file cannot be read
 */
@Test
public void DCAttribute() throws IOException {
    assertCleaned(
            readFile("src/test/resources/test15.html"),
            readFile("src/test/resources/test15_expected.html"));
}
/**
 * Without namespace awareness the {@code xmlns} attribute is stripped and everything
 * is processed as HTML; checked by cleaning {@code test26.html}.
 *
 * @throws IOException if a resource file cannot be read
 */
@Test
public void testTableCellsWithoutNamespaceAwareness() throws IOException {
    cleaner.getProperties().setNamespacesAware(false);
    assertCleaned(
            readFile("src/test/resources/test26.html"),
            readFile("src/test/resources/test26_expected.html"));
}
/**
 * When namespace-aware and the document uses the legacy HTML namespace, the content
 * should be treated as HTML; checked by cleaning {@code test26.html} with unknown
 * tags omitted. See issue #115.
 *
 * @throws IOException if a resource file cannot be read
 */
@Test
public void testTableCellsUsingNamespaceAwareAndLegacyHtmlNS() throws IOException {
    final CleanerProperties props = cleaner.getProperties();
    props.setNamespacesAware(true);
    props.setOmitUnknownTags(true);
    assertCleaned(
            readFile("src/test/resources/test26.html"),
            readFile("src/test/resources/test26_expected.html"));
}
/**
 * When namespace-aware and the document uses XHTML, the content is treated as HTML
 * tags (so TBODY and similar are inserted into the table) while the {@code xmlns}
 * attribute on the {@code html} tag is retained; checked by cleaning
 * {@code test27.html} with unknown tags omitted.
 *
 * @throws IOException if a resource file cannot be read
 */
@Test
public void testTableCellsUsingNamespaceAwareAndXhtmlNS() throws IOException {
    final CleanerProperties props = cleaner.getProperties();
    props.setNamespacesAware(true);
    props.setOmitUnknownTags(true);
    assertCleaned(
            readFile("src/test/resources/test27.html"),
            readFile("src/test/resources/test27_expected.html"));
}
/**
 * When namespace-aware and the document uses an unknown namespace, all content is
 * treated as foreign markup, so no TBODY tags are inserted because the table element
 * is not interpreted as an HTML table; checked by cleaning {@code test28.html} with
 * unknown tags omitted.
 *
 * @throws IOException if a resource file cannot be read
 */
@Test
public void testTableCellsUsingNamespaceAwareAndUnknownNS() throws IOException {
    final CleanerProperties props = cleaner.getProperties();
    props.setNamespacesAware(true);
    props.setOmitUnknownTags(true);
    assertCleaned(
            readFile("src/test/resources/test28.html"),
            readFile("src/test/resources/test28_expected.html"));
}
}
@@ -0,0 +1,34 @@
package org.htmlcleaner;
import junit.framework.TestCase;
import org.junit.Test;
/**
 * Stress test that feeds a very deeply nested document to the cleaner.
 */
public class NestingTest extends TestCase {

    /** Upper bound for the initial buffer size used by {@link #_nestedDoc}. */
    private static final int MAX_PRESIZE = 1 << 20;

    /** Nesting depth of the stress document. */
    public static final int TOO_DEEP_NESTING = 9999;

    /** Document made of {@link #TOO_DEEP_NESTING} nested, empty {@code <div>} elements. */
    public static final String TOO_DEEP_DOC = _nestedDoc(TOO_DEEP_NESTING, "<div>", "</div>", "");

    /**
     * Builds a document of {@code nesting} opening tokens, the content on its
     * own line, and {@code nesting} closing tokens. A line break follows every
     * token whose zero-based index is a multiple of 32.
     *
     * @param nesting number of open/close pairs; values below 1 produce only the content lines
     * @param open    text appended for each opening level
     * @param close   text appended for each closing level
     * @param content text placed between the opening and closing runs
     * @return the assembled document
     */
    public static String _nestedDoc(int nesting, String open, String close, String content) {
        // Size hint is computed in long arithmetic and clamped: a negative or
        // overflowing int product would make the StringBuilder constructor throw.
        long sizeHint = Math.max(nesting, 0) * ((long) open.length() + close.length())
                + content.length() + 2;
        StringBuilder sb = new StringBuilder((int) Math.min(sizeHint, MAX_PRESIZE));
        appendRepeated(sb, open, nesting);
        sb.append("\n").append(content).append("\n");
        appendRepeated(sb, close, nesting);
        return sb.toString();
    }

    /** Appends {@code token} {@code count} times, breaking the line after every 32nd token (starting with the first). */
    private static void appendRepeated(StringBuilder sb, String token, int count) {
        for (int i = 0; i < count; ++i) {
            sb.append(token);
            if ((i & 31) == 0) {
                sb.append("\n");
            }
        }
    }

    /**
     * Cleaning the deeply nested document must complete and yield a root node.
     */
    @Test
    public void testDeepNesting() {
        TagNode root = new HtmlCleaner().clean(TOO_DEEP_DOC);
        assertNotNull("cleaning a deeply nested document should yield a root node", root);
    }
}
@@ -0,0 +1,663 @@
package org.htmlcleaner;
import java.io.File;
import java.io.IOException;
import java.util.regex.Matcher;
import junit.framework.TestCase;
/**
* Tests how individual {@link CleanerProperties} settings affect cleaning and serialization.
* TODO String escaping tests should be moved to UtilsTest class [Eugene]
* @author Eugene Sapozhnikov (blackorangebox@gmail.com)
*/
public class PropertiesTest extends TestCase {
/**
 * Checks which tag info provider a new cleaner reports, depending on the
 * provider set on the properties and the provider passed to the constructor.
 *
 * NOTE(review): the method name has no {@code test} prefix, so a JUnit 3
 * runner will presumably not execute it - confirm whether that is intended.
 *
 * @throws Exception if any of the constructors or assertions fails unexpectedly
 */
public void initialiseCleanerWithProperties() throws Exception {
    // Provider taken from the properties.
    CleanerProperties html5Props = new CleanerProperties();
    html5Props.setTagInfoProvider(Html5TagProvider.INSTANCE);
    HtmlCleaner fromProps = new HtmlCleaner(html5Props);
    assertTrue(fromProps.getTagInfoProvider() instanceof Html5TagProvider);

    // No provider anywhere: the default provider is expected.
    CleanerProperties nullProviderProps = new CleanerProperties();
    nullProviderProps.setTagInfoProvider(null);
    HtmlCleaner defaulted = new HtmlCleaner(nullProviderProps);
    assertTrue(defaulted.getTagInfoProvider() instanceof DefaultTagProvider);

    // Explicit null provider argument plus null provider in the properties.
    CleanerProperties nullBothProps = new CleanerProperties();
    nullBothProps.setTagInfoProvider(null);
    HtmlCleaner defaultedTwoArgs = new HtmlCleaner(null,nullBothProps);
    assertTrue(defaultedTwoArgs.getTagInfoProvider() instanceof DefaultTagProvider);

    // Provider passed to the constructor, none in the properties.
    CleanerProperties ctorOnlyProps = new CleanerProperties();
    ctorOnlyProps.setTagInfoProvider(null);
    HtmlCleaner fromCtor = new HtmlCleaner(Html5TagProvider.INSTANCE, ctorOnlyProps);
    assertTrue(fromCtor.getTagInfoProvider() instanceof Html5TagProvider);

    // Constructor argument and properties disagree: the HTML5 provider is expected.
    CleanerProperties conflictingProps = new CleanerProperties();
    conflictingProps.setTagInfoProvider(DefaultTagProvider.INSTANCE);
    HtmlCleaner conflicting = new HtmlCleaner(Html5TagProvider.INSTANCE, conflictingProps);
    assertTrue(conflicting.getTagInfoProvider() instanceof Html5TagProvider);
}
/**
 * Serializes the shared fixture with advanced XML escaping on and off and
 * checks the escaped form of the entity div in each case.
 */
public void testPropertiesAdvancedXmlEscape() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setNamespacesAware(false);

    props.setAdvancedXmlEscape(true);
    String advanced = getXmlString(htmlCleaner, props);
    assertTrue(advanced.contains("<div>&amp;&quot;&apos;&lt;&gt;</div>"));

    props.setAdvancedXmlEscape(false);
    String plain = getXmlString(htmlCleaner, props);
    assertTrue(plain, plain.contains("<div>&amp;amp;&amp;quot;&amp;apos;&amp;lt;&amp;gt;</div>"));
}
/**
 * Serializes the shared fixture with and without CDATA wrapping for script
 * and style content and checks both element bodies.
 */
public void testUseCdataForScriptAndStyle() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setNamespacesAware(false);
    props.setAdvancedXmlEscape(false);

    // CDATA on: both bodies are expected between the safe CDATA markers.
    props.setUseCdataForScriptAndStyle(true);
    String withCdata = getXmlString(htmlCleaner, props);
    String expectedScript = "<script>" + CData.SAFE_BEGIN_CDATA + "\nvar x=y&&z;\n" + CData.SAFE_END_CDATA
            + "</script>";
    assertTrue("looking for :\"" + expectedScript + "\" in :\n" + withCdata,
            withCdata.contains(expectedScript));
    String expectedStyle = "<style>" + CData.SAFE_BEGIN_CDATA + "\n.test{font-size:10;}\n" + CData.SAFE_END_CDATA
            + "</style>";
    assertTrue("looking for :\"" + expectedStyle + "\" in :\n" + withCdata,
            withCdata.contains(expectedStyle));

    // CDATA off: bodies are expected inline, with the script ampersands escaped.
    props.setUseCdataForScriptAndStyle(false);
    String scriptRun = getXmlString(htmlCleaner, props);
    assertTrue(scriptRun.contains("<script>var x=y&amp;&amp;z;</script>"));
    String styleRun = getXmlString(htmlCleaner, props);
    assertTrue(styleRun.contains("<style>.test{font-size:10;}</style>"));
}
/**
 * Checks that the div of translated special characters appears in the
 * serialized fixture only while special-entity translation is enabled.
 */
public void testTranslateSpecialEntities() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    final String translatedDiv =
            "<div>" + String.valueOf(new char[] { 244, 8240, 215, 376, 8364 }) + "</div>";

    props.setTranslateSpecialEntities(true);
    String translated = getXmlString(htmlCleaner, props);
    assertTrue(translated.contains(translatedDiv));

    props.setTranslateSpecialEntities(false);
    String untranslated = getXmlString(htmlCleaner, props);
    assertFalse(untranslated.contains(translatedDiv));
}
/**
 * Checks that numeric character references in the fixture are turned into
 * characters only while unicode recognition is enabled; otherwise the escaped
 * references are expected in the output.
 */
public void testRecognizeUnicodeChars() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    final String decodedDiv = "<div>" + String.valueOf(new char[] { 352, 8224, 8249 }) + "</div>";

    props.setRecognizeUnicodeChars(true);
    assertTrue(getXmlString(htmlCleaner, props).contains(decodedDiv));

    props.setRecognizeUnicodeChars(false);
    assertFalse(getXmlString(htmlCleaner, props).contains(decodedDiv));
    assertTrue(getXmlString(htmlCleaner, props).contains("<div>&amp;#352;&amp;#8224;&amp;#8249;</div>"));
}
/**
 * Checks that an unknown tag is dropped (while its text is kept) when unknown
 * tags are omitted, and is serialized as-is otherwise.
 */
public void testOmitUnknownTags() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    props.setOmitUnknownTags(true);
    assertFalse(getXmlString(htmlCleaner, props).contains("<mytag>content of unknown tag</mytag>"));
    assertTrue(getXmlString(htmlCleaner, props).contains("content of unknown tag"));

    props.setOmitUnknownTags(false);
    assertTrue(getXmlString(htmlCleaner, props).contains("<mytag>content of unknown tag</mytag>"));
}
/**
 * Checks that an unknown tag is emitted as escaped text when unknown tags are
 * treated as content, and as markup otherwise.
 */
public void testTreatUnknownTagsAsContent() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);
    props.setNamespacesAware(false);
    props.setOmitUnknownTags(false);

    props.setTreatUnknownTagsAsContent(true);
    String asContent = getXmlString(htmlCleaner, props);
    assertTrue(asContent.contains("&lt;mytag&gt;content of unknown tag&lt;/mytag&gt;"));

    props.setTreatUnknownTagsAsContent(false);
    String asMarkup = getXmlString(htmlCleaner, props);
    assertTrue(asMarkup.contains("<mytag>content of unknown tag</mytag>"));
}
/**
 * Checks that namespace declarations and prefixes survive serialization when
 * namespace awareness is on and are dropped when it is off.
 */
public void testNamespacesAware() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    props.setNamespacesAware(true);
    String aware = getXmlString(htmlCleaner, props);
    assertTrue(aware.contains("<html xmlns:my=\"my\">"));
    assertTrue(aware.contains("<my:tag id=\"xxx\">aaa</my:tag>"));

    props.setNamespacesAware(false);
    String unawareHtml = getXmlString(htmlCleaner, props);
    assertTrue(unawareHtml.contains("<html"));
    String unawareTag = getXmlString(htmlCleaner, props);
    assertTrue(unawareTag.contains("<tag id=\"xxx\">aaa</tag>"));
}
/**
 * Checks that a deprecated tag is dropped (while its text is kept) when
 * deprecated tags are omitted, and is serialized as-is otherwise.
 */
public void testOmitDeprecatedTags() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    props.setOmitDeprecatedTags(true);
    String omittedTag = getXmlString(htmlCleaner, props);
    assertFalse(omittedTag.contains("<u>content of deprecated tag</u>"));
    String omittedText = getXmlString(htmlCleaner, props);
    assertTrue(omittedText.contains("content of deprecated tag"));

    props.setOmitDeprecatedTags(false);
    String kept = getXmlString(htmlCleaner, props);
    assertTrue(kept.contains("<u>content of deprecated tag</u>"));
}
/**
 * Checks that a deprecated tag is emitted as escaped text when deprecated
 * tags are treated as content, and as markup otherwise.
 */
public void testTreatDeprecatedTagsAsContent() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);
    props.setOmitDeprecatedTags(false);

    props.setTreatDeprecatedTagsAsContent(true);
    String asContent = getXmlString(htmlCleaner, props);
    assertTrue(asContent.contains("&lt;u&gt;content of deprecated tag&lt;/u&gt;"));

    props.setTreatDeprecatedTagsAsContent(false);
    String asMarkup = getXmlString(htmlCleaner, props);
    assertTrue(asMarkup.contains("<u>content of deprecated tag</u>"));
}
/**
 * Checks that the fixture's comment is serialized only while comments are
 * not omitted.
 *
 * @throws Exception if cleaning or serializing the fixture fails
 */
public void testOmitComments() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setNamespacesAware(false);

    props.setOmitComments(false);
    assertTrue(getXmlString(htmlCleaner, props).contains("<!--my comment-->"));

    props.setOmitComments(true);
    assertFalse(getXmlString(htmlCleaner, props).contains("<!--my comment-->"));
}
/**
 * Checks which empty elements are collapsed into self-closing tags depending
 * on the use-empty-element-tags setting.
 */
public void testUseEmptyElementTags() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    // Tag <a> cannot be collapsed according to DefaultTagProvider.
    props.setUseEmptyElementTags(true);
    String anchorCollapsing = getXmlString(htmlCleaner, props);
    assertFalse(anchorCollapsing.contains("<a href=\"index.php\" />"));
    assertTrue(anchorCollapsing.contains("<a href=\"index.php\"></a>"));

    props.setUseEmptyElementTags(false);
    String anchorExpanded = getXmlString(htmlCleaner, props);
    assertTrue(anchorExpanded.contains("<a href=\"index.php\"></a>"));

    props.setUseEmptyElementTags(true);
    String lineBreak = getXmlString(htmlCleaner, props);
    assertTrue(lineBreak.contains("<br />"));
    String tableCollapsed = getXmlString(htmlCleaner, props);
    // jericho reports that td can not be empty, so the check is on <tr/> collapsing.
    assertTrue(tableCollapsed, tableCollapsed.contains("<tr><td></td></tr><tr />"));

    props.setUseEmptyElementTags(false);
    String tableExpanded = getXmlString(htmlCleaner, props);
    assertTrue(tableExpanded.contains("<table><tbody><tr><td></td></tr><tr></tr></tbody></table>"));
}
/**
 * Checks multi-word attribute handling and then, on the same cleaner,
 * toggles HTML-inside-attributes, ignore-quest-and-exclam and namespace
 * awareness, asserting the serialized fixture after each change.
 */
public void testAllowMultiWordAttributes() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);
    props.setUseEmptyElementTags(false);
    String xml;

    // Multi-word attribute values.
    props.setAllowMultiWordAttributes(false);
    xml = getXmlString(htmlCleaner, props);
    assertFalse(xml.contains("<div att=\"a b c\">"));
    assertTrue(xml.contains("<div att=\"a\" b=\"b\" c=\"c\">"));

    props.setAllowMultiWordAttributes(true);
    xml = getXmlString(htmlCleaner, props);
    assertTrue(xml.contains("<div att=\"a b c\">"));

    // HTML inside attribute values.
    props.setAllowHtmlInsideAttributes(true);
    xml = getXmlString(htmlCleaner, props);
    assertTrue(xml.contains("<a title=\"&lt;b&gt;Title&lt;b&gt; is here\">LINK 1</a>"));

    props.setAllowHtmlInsideAttributes(false);
    xml = getXmlString(htmlCleaner, props);
    assertFalse(xml.contains("<a title=\"&lt;b&gt;Title&lt;b&gt; is here\">LINK 1</a>"));
    assertTrue(xml.contains("<a title=\"\"><b>Title<b> is here&quot;&gt;LINK 1</b></b></a>"));

    // "<!" and "<?" instructions.
    props.setIgnoreQuestAndExclam(true);
    xml = getXmlString(htmlCleaner, props);
    assertFalse(xml.contains("&lt;!INSTRUCTION1 id=&quot;aaa&quot;&gt;"));
    assertFalse(xml.contains("&lt;?INSTRUCTION2 id=&quot;bbb&quot;&gt;"));

    props.setIgnoreQuestAndExclam(false);
    xml = getXmlString(htmlCleaner, props);
    assertTrue(xml.contains("&lt;!INSTRUCTION1 id=&quot;aaa&quot;&gt;"));
    assertTrue(xml.contains("&lt;?INSTRUCTION2 id=&quot;bbb&quot;&gt;"));

    // Namespace awareness.
    props.setNamespacesAware(true);
    xml = getXmlString(htmlCleaner, props);
    assertTrue(xml.contains("<html xmlns:my=\"my\">"));
    assertTrue(xml.contains("<my:tag id=\"xxx\">aaa</my:tag>"));

    props.setNamespacesAware(false);
    xml = getXmlString(htmlCleaner, props);
    assertTrue(xml.contains("<html"));
    assertTrue(xml.contains("<tag id=\"xxx\">aaa</tag>"));
}
/**
 * Checks how markup inside an attribute value is serialized with
 * HTML-inside-attributes allowed and disallowed.
 */
public void testAllowHtmlInsideAttributes() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    props.setAllowHtmlInsideAttributes(true);
    String allowed = getXmlString(htmlCleaner, props);
    assertTrue(allowed.contains("<a title=\"&lt;b&gt;Title&lt;b&gt; is here\">LINK 1</a>"));

    props.setAllowHtmlInsideAttributes(false);
    String disallowed = getXmlString(htmlCleaner, props);
    assertFalse(disallowed.contains("<a title=\"&lt;b&gt;Title&lt;b&gt; is here\">LINK 1</a>"));
    String disallowedAgain = getXmlString(htmlCleaner, props);
    assertTrue(disallowedAgain.contains("<a title=\"\"><b>Title<b> is here&quot;&gt;LINK 1</b></b></a>"));
}
/**
 * Checks that the fixture's "&lt;!" and "&lt;?" instructions appear as
 * escaped text only while they are not being ignored.
 */
public void testIgnoreQuestAndExclam() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    props.setIgnoreQuestAndExclam(true);
    String ignoredExclam = getXmlString(htmlCleaner, props);
    assertFalse(ignoredExclam.contains("&lt;!INSTRUCTION1 id=&quot;aaa&quot;&gt;"));
    String ignoredQuest = getXmlString(htmlCleaner, props);
    assertFalse(ignoredQuest.contains("&lt;?INSTRUCTION2 id=&quot;bbb&quot;&gt;"));

    props.setIgnoreQuestAndExclam(false);
    String keptExclam = getXmlString(htmlCleaner, props);
    assertTrue(keptExclam.contains("&lt;!INSTRUCTION1 id=&quot;aaa&quot;&gt;"));
    String keptQuest = getXmlString(htmlCleaner, props);
    assertTrue(keptQuest.contains("&lt;?INSTRUCTION2 id=&quot;bbb&quot;&gt;"));
}
/**
 * Checks comment omission and the replacement used for double hyphens inside
 * comments (first the output seen before a replacement is set, then "*").
 *
 * @throws IOException if cleaning or serializing the fixture fails
 */
public void testComments() throws IOException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setNamespacesAware(false);

    props.setOmitComments(false);
    assertTrue(getXmlString(htmlCleaner, props).contains("<!--my comment-->"));

    props.setOmitComments(true);
    assertFalse(getXmlString(htmlCleaner, props).contains("<!--my comment-->"));

    props.setOmitComments(false);
    assertTrue(getXmlString(htmlCleaner, props).contains("<!-- comment with == - hyphen -->"));

    props.setHyphenReplacementInComment("*");
    assertTrue(getXmlString(htmlCleaner, props).contains("<!-- comment with ** - hyphen -->"));
}
/**
 * Checks that the XML declaration is serialized only while it is not omitted.
 *
 * @throws IOException if cleaning or serializing the fixture fails
 */
public void testOmitXmlDeclaration() throws IOException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setNamespacesAware(false);

    props.setOmitXmlDeclaration(false);
    assertTrue(getXmlString(htmlCleaner, props).contains("<?xml version=\"1.0\""));

    props.setOmitXmlDeclaration(true);
    assertFalse(getXmlString(htmlCleaner, props).contains("<?xml version=\"1.0\""));
}
/**
 * Checks that the fixture's DOCTYPE is serialized only while the doctype
 * declaration is not omitted.
 */
public void testOmitDoctypeDeclaration() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setAdvancedXmlEscape(false);

    props.setOmitDoctypeDeclaration(false);
    assertTrue(getXmlString(htmlCleaner, props).contains(
            "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">"));

    props.setOmitDoctypeDeclaration(true);
    assertFalse(getXmlString(htmlCleaner, props).contains(
            "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">"));
}
/**
 * With HTML version 4, checks that the html/head/body envelope is serialized
 * only while the envelope is not omitted.
 *
 * @throws IOException if cleaning or serializing the fixture fails
 */
public void testOmitHtmlEnvelope() throws IOException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setHtmlVersion(4);
    props.setNamespacesAware(false);
    props.setAddNewlineToHeadAndBody(false);

    props.setOmitHtmlEnvelope(true);
    String omittedStart = getXmlString(htmlCleaner, props);
    assertFalse(omittedStart.contains("<html><head>"));
    String omittedEnd = getXmlString(htmlCleaner, props);
    assertFalse(omittedEnd.contains("</body></html>"));

    props.setOmitHtmlEnvelope(false);
    String keptStart = getXmlString(htmlCleaner, props);
    assertTrue(keptStart, keptStart.contains("<html><head>"));
    String keptEnd = getXmlString(htmlCleaner, props);
    assertTrue(keptEnd, keptEnd.contains("</body></html>"));
}
/**
 * With HTML version 5, checks that the html/head/body envelope is serialized
 * only while the envelope is not omitted.
 *
 * @throws IOException if cleaning or serializing the fixture fails
 */
public void testOmitHtml5Envelope() throws IOException {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setHtmlVersion(5);
    props.setNamespacesAware(false);
    props.setAddNewlineToHeadAndBody(false);

    props.setOmitHtmlEnvelope(true);
    String omittedStart = getXmlString(htmlCleaner, props);
    assertFalse(omittedStart.contains("<html><head>"));
    String omittedEnd = getXmlString(htmlCleaner, props);
    assertFalse(omittedEnd.contains("</body></html>"));

    props.setOmitHtmlEnvelope(false);
    String keptStart = getXmlString(htmlCleaner, props);
    assertTrue(keptStart, keptStart.contains("<html><head><style>"));
    String keptEnd = getXmlString(htmlCleaner, props);
    assertTrue(keptEnd, keptEnd.contains("</body></html>"));
}
/**
 * Checks the prune-tags list and the allow-tags list against the serialized
 * fixture: pruned tags disappear, and with an allow list only listed tags
 * remain.
 */
public void testPruneProperties() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.reset();

    props.setPruneTags("div,mytag");
    String pruned = getXmlString(htmlCleaner, props);
    assertFalse(pruned.contains("<div"));
    assertFalse(getXmlString(htmlCleaner, props).contains("<mytag"));

    props.setPruneTags("");
    props.setAllowTags("html,body,div");
    String allowed = getXmlString(htmlCleaner, props);
    assertTrue(allowed.contains("<div"));
    assertFalse(getXmlString(htmlCleaner, props).contains("<mytag"));
}
/**
 * Checks how a boolean attribute is serialized for each boolean-attribute-values
 * setting, starting from reset properties.
 */
public void testEmptyAttributesProperties() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.reset();

    String afterReset = getXmlString(htmlCleaner, props);
    assertTrue(afterReset.contains("<input checked=\"checked\" />"));

    props.setBooleanAttributeValues("empty");
    assertTrue(getXmlString(htmlCleaner, props).contains("<input checked=\"\" />"));

    props.setBooleanAttributeValues("true");
    assertTrue(getXmlString(htmlCleaner, props).contains("<input checked=\"true\" />"));

    // NOTE(review): "selft" may be a typo for "self" or a deliberately unknown
    // value; either way the output after reset is expected again - confirm intent.
    props.setBooleanAttributeValues("selft");
    assertTrue(getXmlString(htmlCleaner, props).contains("<input checked=\"checked\" />"));
}
/**
 * Cleans the shared fixture {@code src/test/resources/test4.html} (read as
 * UTF-8) and serializes the result with a {@link SimpleXmlSerializer} built
 * from the given properties.
 *
 * @param cleaner    cleaner used to parse the fixture
 * @param properties properties handed to the serializer
 * @return the serialized document
 * @throws IOException if reading or serializing fails
 */
private String getXmlString(HtmlCleaner cleaner, CleanerProperties properties) throws IOException {
    File fixture = new File("src/test/resources/test4.html");
    TagNode root = cleaner.clean(fixture, "UTF-8");
    return new SimpleXmlSerializer(properties).getAsString(root);
}
/**
 * Checks how entities, including the non-breaking space, are serialized,
 * first as XML and then with the serializer switched to HTML DOM creation.
 */
public void testNbsp() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setTranslateSpecialEntities(false);
    props.setOmitDoctypeDeclaration(false);
    props.setOmitXmlDeclaration(true);
    props.setAdvancedXmlEscape(true);
    props.setAddNewlineToHeadAndBody(false);

    final String doctypeLine = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n";
    TagNode root = htmlCleaner.clean(
            doctypeLine + "<div>&#x20;&amp;&quot;&apos;'&lt;&gt;&nbsp;&garbage;&</div>");
    SimpleXmlSerializer serializer = new SimpleXmlSerializer(props);

    // XML output: the non-breaking space is expected as the raw character 160.
    String asXml = serializer.getAsString(root, "UTF-8");
    assertEquals(
            doctypeLine + "<html><head /><body><div> &amp;&quot;&apos;&apos;&lt;&gt;" + (char) 160
                    + "&amp;garbage;&amp;</div></body></html>",
            asXml.trim());

    // HTML output: apostrophes stay literal and the space is expected as &nbsp;
    // (an earlier expectation used &#39; for the apostrophes).
    serializer.setCreatingHtmlDom(true);
    String asHtml = serializer.getAsString(root, "UTF-8");
    assertEquals(
            doctypeLine
                    + "<html><head /><body><div> &amp;&quot;''&lt;&gt;&nbsp;&amp;garbage;&amp;</div></body></html>",
            asHtml.trim());
}
/**
 * Makes sure a hexadecimal character reference needs the leading 'x'.
 * <ul>
 * <li>&#138A; is converted by FF to 3 characters: &#138; + 'A' + ';'</li>
 * <li>&#0x138A; is converted by FF to 6? 7? characters: &#0 'x'+'1'+'3'+
 * '8' + 'A' + ';' #0 is displayed kind of weird</li>
 * <li>&#x138A; is a single character</li>
 * </ul>
 *
 * @throws Exception if serialization fails
 */
public void testHexConversion() throws Exception {
    CleanerProperties props = new CleanerProperties();
    props.setOmitHtmlEnvelope(true);
    props.setOmitXmlDeclaration(true);
    SimpleXmlSerializer serializer = new SimpleXmlSerializer(props);
    serializer.setCreatingHtmlDom(false);

    // Without the 'x' only the decimal digits form the reference.
    String decimalPrefix = serializer.getAsString( "<div>&#138A;</div>");
    assertEquals("<div>" + (char) 138 + "A;" + "</div>", decimalPrefix);

    // With the 'x' the whole reference is one character.
    String hexReference = serializer.getAsString( "<div>&#x138A;</div>");
    assertEquals("<div>" + (char) 0x138A + "</div>", hexReference);

    props.reset();
}
/**
 * Table-driven check of the Utils.HEX_STRICT, Utils.HEX_RELAXED and
 * Utils.DECIMAL patterns.
 *
 * Each row is: input, then for strict, relaxed and decimal in that order a
 * group of four values - whether find() succeeds, the expected match start,
 * the expected match end, and the expected text of group 1. Start, end and
 * group are only checked when a match is found.
 */
public void testPattern() {
for (Object[] test : new Object[][] {
new Object[] { "0x138A;", false, -1, -1, null, true, 0, 7, "x138A", true, 0, 1, "0" },
new Object[] { "x138A;", true, 0, 6, "x138A", true, 0, 6, "x138A", false, -1, -1, null },
new Object[] { "138;", false, -1, -1, null, false, -1, -1, null, true, 0, 4, "138" },
new Object[] { "139", false, -1, -1, null, false, -1, -1, null, true, 0, 3, "139" },
new Object[] { "x13A", true, 0, 4, "x13A", true, 0, 4, "x13A", false, -1, -1, null },
new Object[] { "13F", false, -1, -1, null, false, -1, -1, null, true, 0, 2, "13" },
new Object[] { "13", false, -1, -1, null, false, -1, -1, null, true, 0, 2, "13" },
new Object[] { "X13AZ", true, 0, 4, "X13A", true, 0, 4, "X13A", false, -1, -1, null } }) {
// Unpack the row positionally; i walks the columns left to right.
int i = 0;
String input = (String) test[i++];
boolean strict = (Boolean) test[i++];
int sstart = (Integer) test[i++];
int send = (Integer) test[i++];
String sgroup = (String) test[i++];
boolean relaxed = (Boolean) test[i++];
int rstart = (Integer) test[i++];
int rend = (Integer) test[i++];
String rgroup = (String) test[i++];
boolean decimal = (Boolean) test[i++];
int dstart = (Integer) test[i++];
int dend = (Integer) test[i++];
String dgroup = (String) test[i++];
// Strict hex pattern.
Matcher m = Utils.HEX_STRICT.matcher(input);
boolean actual = m.find();
assertEquals(input, strict, actual);
if (actual) {
assertEquals(input + " strict start ", sstart, m.start());
assertEquals(input + " strict end ", send, m.end());
assertEquals(input + " strict group ", sgroup, m.group(1));
}
// Relaxed hex pattern.
m = Utils.HEX_RELAXED.matcher(input);
actual = m.find();
assertEquals(input, relaxed, actual);
if (actual) {
assertEquals(input + " relaxed start ", rstart, m.start());
assertEquals(input + " relaxed end ", rend, m.end());
assertEquals(input + " relaxed group ", rgroup, m.group(1));
}
// Decimal pattern.
m = Utils.DECIMAL.matcher(input);
actual = m.find();
assertEquals(input, decimal, actual);
if (actual) {
assertEquals(input + " decimal start ", dstart, m.start());
assertEquals(input + " decimal end ", dend, m.end());
assertEquals(input + " decimal group ", dgroup, m.group(1));
}
}
}
/**
 * Serializes a fragment with an unclosed div inside an h3 and checks that
 * the div is closed before the h3 in the output.
 */
public void testConvertUnicode() throws Exception {
    CleanerProperties props = new CleanerProperties();
    props.setOmitHtmlEnvelope(true);
    props.setOmitXmlDeclaration(true);
    props.setUseEmptyElementTags(false);

    // right tick is special unicode character 8217
    // NOTE(review): the input below contains no such character - confirm
    // whether it was lost from the fixture.
    SimpleXmlSerializer serializer = new SimpleXmlSerializer(props);
    String serialized = serializer.getAsString(
            "<h3><u><strong>Presidents Message</strong></u><div> </h3>");
    assertEquals("<h3><u><strong>Presidents Message</strong></u><div> </div></h3>", serialized);
}
// Building blocks for the old-style "comment out" script fixtures used by
// disabledTestConvertOldStyleComments().
// Document prefix up to and including the opening script tag.
private static final String HTML_COMMENT_OUT_BEGIN = "<html><head><script>";
// Document suffix starting at the closing script tag.
private static final String HTML_COMMENT_OUT_END = "</script></head><body></body></html>";
// Script body placed between the comment markers.
private static final String SAMPLE_JS = "var x = ['foo','bar'];";
// HTML comment delimiters.
private static final String COMMENT_START = "<!--";
private static final String COMMENT_END = "-->";
/**
 * Test conversion of the former (now bad practice) idiom:
 *
 * <pre>
 * &lt;style>&lt;!-- style info -->&lt;/style>
 * </pre>
 *
 * into &lt;style>/(star)&lt;![CDATA[(star)/ style info
 * /(star)]]>(star)/&lt;/style>
 *
 * Each test row is a pair: the input document and the output expected when
 * CDATA is used for script and style. The rows are checked by
 * doTestConvertOldStyleComments.
 *
 * Note: disabled (no {@code test} prefix) because it doesn't test actual behavior.
 *
 * @throws IOException if cleaning or serializing a fixture fails
 */
public void disabledTestConvertOldStyleComments() throws IOException {
// TODO: May need additional flag to handle '<' inside of scripts
// dontEscape() in xml serializer should not be triggered based on use
// cdata
// but dontEscape is used by subclasses -- need to investigate best
// solution.
// maybe o.k. to have the < > be translated. That is what original test
// does.
// but the ' should probably not be touched??
HtmlCleaner cleaner = new HtmlCleaner();
CleanerProperties properties = new CleanerProperties();
properties.setOmitXmlDeclaration(true);
properties.setUseCdataForScriptAndStyle(true);
properties.setAddNewlineToHeadAndBody(false);
// test for positive matches to old-style comment hacks
for (String[] testData : new String[][] {
// normal case - remove old-style comment out hack
new String[] {
HTML_COMMENT_OUT_BEGIN + "//" + COMMENT_START + "\n" + SAMPLE_JS + "//" + COMMENT_END + "\n"
+ HTML_COMMENT_OUT_END,
HTML_COMMENT_OUT_BEGIN + CData.SAFE_BEGIN_CDATA + "\n" + SAMPLE_JS
+ CData.SAFE_END_CDATA + "\n" + HTML_COMMENT_OUT_END },
// don't let random whitespace confuse things
new String[] {
HTML_COMMENT_OUT_BEGIN + "\n\n\n\n" + "//" + " \t" + COMMENT_START + "\n" + SAMPLE_JS
+ "\n\n\n" + "//" + COMMENT_END + "\n\n\t\n" + HTML_COMMENT_OUT_END,
HTML_COMMENT_OUT_BEGIN + "\n\n\n\n" + CData.SAFE_BEGIN_CDATA + "\n" + SAMPLE_JS
+ "\n\n\n" + "//" + CData.SAFE_END_CDATA + "\n\n\t\n" + HTML_COMMENT_OUT_END },
}) {
doTestConvertOldStyleComments(cleaner, properties, testData);
}
// test for false positives
for (String[] testData : new String[][] {
// make sure not to remove real comments
new String[] {
HTML_COMMENT_OUT_BEGIN + "//" + "an ordinary comment" + "\n" + SAMPLE_JS + "//" + "a final remark"
+ HTML_COMMENT_OUT_END,
HTML_COMMENT_OUT_BEGIN + CData.SAFE_BEGIN_CDATA + "//" + "an ordinary comment" + "\n"
+ SAMPLE_JS + "//" + "a final remark" + CData.SAFE_END_CDATA + HTML_COMMENT_OUT_END }, }) {
doTestConvertOldStyleComments(cleaner, properties, testData);
}
}
/**
 * Cleans {@code testData[0]} and serializes it twice: with CDATA for script
 * and style switched off the output must equal the input, and with it
 * switched on the output must equal {@code testData[1]}.
 *
 * @param cleaner    cleaner used to parse the input
 * @param properties serializer properties; the CDATA flag is changed by this
 *                   method and left set to {@code true}
 * @param testData   pair of input document and expected CDATA output
 * @throws IOException if serialization fails
 */
private void doTestConvertOldStyleComments(HtmlCleaner cleaner, CleanerProperties properties, String[] testData)
        throws IOException {
    TagNode root = cleaner.clean(testData[0]);

    // The no-op path must round-trip the input unchanged.
    properties.setUseCdataForScriptAndStyle(false);
    String withoutCdata = new SimpleXmlSerializer(properties).getAsString(root);
    assertEquals(testData[0], withoutCdata);

    // The actual conversion.
    properties.setUseCdataForScriptAndStyle(true);
    String withCdata = new SimpleXmlSerializer(properties).getAsString(root);
    assertEquals(testData[1], withCdata);
}
/**
 * Checks that a document whose style element already carries commented CDATA
 * markers is serialized back unchanged when CDATA is used for script and style.
 *
 * @throws IOException if serialization fails
 */
public void testIgnoreClosingCData() throws IOException {
    CleanerProperties props = new CleanerProperties();
    props.setOmitXmlDeclaration(true);
    props.setUseCdataForScriptAndStyle(true);
    props.setAddNewlineToHeadAndBody(false);
    props.setIgnoreQuestAndExclam(false);

    final String document = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
            + "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta http-equiv=\"content-type\" content=\"application/xhtml+xml; charset=utf-8\" /><link href=\"aswa.css\" type=\"text/css\" rel=\"stylesheet\" /><title>ASWA - Events</title>"
            + "<style type=\"text/css\">/*<![CDATA[*/\r\n"
            + "#ampmep_188 { }\r\n"
            + "/*]]>*/</style></head><body></body></html>";

    TagNode root = new HtmlCleaner(props).clean(document);
    String roundTripped = new SimpleXmlSerializer(props).getAsString(root);
    assertEquals(document, roundTripped);
}
/**
 * Checks that reserved characters are serialized as numeric character
 * references when that translation is on, and as named entities when it is off.
 */
public void testTransResCharsToNCR() throws Exception {
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties props = htmlCleaner.getProperties();
    props.setNamespacesAware(false);
    props.setAdvancedXmlEscape(true);

    props.setTransResCharsToNCR(true);
    String numericFirst = getXmlString(htmlCleaner, props);
    assertTrue(numericFirst.contains("<div>1.&#38;&#34;&#39;&#60;&#62;</div>"));
    String numericSecond = getXmlString(htmlCleaner, props);
    assertTrue(numericSecond.contains("<div>2.&#38;&#34;&#39;&#60;&#62;</div>"));

    props.setTransResCharsToNCR(false);
    String namedFirst = getXmlString(htmlCleaner, props);
    assertTrue(namedFirst.contains("<div>1.&amp;&quot;&apos;&lt;&gt;</div>"));
    String namedSecond = getXmlString(htmlCleaner, props);
    assertTrue(namedSecond.contains("<div>2.&amp;&quot;&apos;&lt;&gt;</div>"));
}
}
@@ -0,0 +1,21 @@
package org.htmlcleaner;
import java.io.IOException;
import junit.framework.TestCase;
/**
 * Checks that end tags appearing without a matching start tag are dropped
 * from the output.
 */
public class RandomCloseTagTest extends TestCase {

    /**
     * Cleans text littered with stray closing tags and expects only the text
     * to remain once the envelope and XML declaration are omitted.
     *
     * @throws IOException if serialization fails
     */
    public void testRandomCloseTagsRemoved() throws IOException {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        CleanerProperties props = htmlCleaner.getProperties();
        props.setOmitHtmlEnvelope(true);
        props.setOmitXmlDeclaration(true);
        SimpleXmlSerializer xmlSerializer = new SimpleXmlSerializer(props);

        TagNode root = htmlCleaner.clean("Some</span> text </b></div>");
        String serialized = xmlSerializer.getAsString(root);

        assertEquals("Some text ", serialized);
    }
}
@@ -0,0 +1,19 @@
package org.htmlcleaner;
import java.io.File;
import java.io.IOException;
import junit.framework.TestCase;
/**
 * Smoke tests: each method cleans one stored page and passes as long as no
 * exception is thrown.
 */
public class RandomPageTest extends TestCase {

    /** Cleans the stored page {@code gg_prob.html}. */
    public void testPage() throws IOException {
        File page = new File("src/test/resources/gg_prob.html");
        new HtmlCleaner().clean(page);
    }

    /** Cleans the stored page {@code gg_prob_cleaned.html}. */
    public void testHtml() throws IOException {
        File page = new File("src/test/resources/gg_prob_cleaned.html");
        new HtmlCleaner().clean(page);
    }
}
+223
View File
@@ -0,0 +1,223 @@
/* Copyright (c) 2006-2013, the HtmlCleaner Project
All rights reserved.
Redistribution and use of this software in source and binary forms,
with or without modification, are permitted provided that the following
conditions are met:
* Redistributions of source code must retain the above
copyright notice, this list of conditions and the
following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other
materials provided with the distribution.
* The name of HtmlCleaner may not be used to endorse or promote
products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
package org.htmlcleaner;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import org.junit.Ignore;
import org.junit.Test;
/**
 * Tests for how {@link HtmlCleaner} treats inline {@code <svg>} markup:
 * SVG nested inside a paragraph, nested and unclosed SVG elements, SVG
 * without a namespace declaration, and file-based fixtures that check SVG
 * content survives cleaning.
 */
public class SVGTest extends AbstractHtmlCleanerTest {

    /**
     * Installs a custom-configured cleaner and a {@link SimpleXmlSerializer}
     * on this test instance, then hands two renderings of the same
     * {@code <p><svg>...</svg></p>} fragment to {@code assertHTML}: one with a
     * self-closed {@code <circle />} and one with an explicit
     * {@code </circle>}.
     */
    @Test
    public void svgTreatedAsPhrasing() throws IOException {
        CleanerProperties props = new CleanerProperties();
        props.setOmitXmlDeclaration(false);
        props.setOmitDoctypeDeclaration(false);
        props.setIgnoreQuestAndExclam(false);
        props.setAddNewlineToHeadAndBody(false);
        props.setUseCdataFor("script,style,altscript");
        this.cleaner = new HtmlCleaner(props);
        this.serializer = new SimpleXmlSerializer(cleaner.getProperties());

        final String selfClosedCircle =
            "<p><svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\"><circle cx=\"100\" cy=\"50\" fill=\"red\" r=\"40\" stroke=\"black\" stroke-width=\"2\" /></svg></p>";
        final String explicitlyClosedCircle =
            "<p><svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\"><circle cx=\"100\" cy=\"50\" fill=\"red\" r=\"40\" stroke=\"black\" stroke-width=\"2\"></circle></svg></p>";
        assertHTML(selfClosedCircle, explicitlyClosedCircle);
    }

    /**
     * Smoke test: cleans three small documents containing SVG, each with a
     * fresh default {@link HtmlCleaner}. Nothing is asserted; the test passes
     * when none of the clean calls throws.
     */
    @Test
    public void nestedSVG() {
        // <svg> nested in an <svg> whose xmlns value is "http://www.w3.org/2000/".
        final String nestedWithShortNamespace = "<!DOCTYPE html>\n"
            + "<html lang=\"en\">\n"
            + "<head>\n"
            + "</head>\n"
            + "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
            + "<svg xmlns=\"http://www.w3.org/2000/\">\n"
            + " <svg></svg>\n"
            + "</svg>\n"
            + "</body>\n"
            + "</html>";

        // A single <circle> inside a namespaced <svg>.
        final String circleOnly = "<!DOCTYPE html>\n"
            + "<html lang=\"en\">\n"
            + "<head>\n"
            + "</head>\n"
            + "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
            + "<svg xmlns=\"http://www.w3.org/2000/svg\">\n"
            + " <circle cx=\"50\" cy=\"50\" r=\"40\" stroke=\"black\" stroke-width=\"3\" fill=\"red\" />\n"
            + "</svg>\n"
            + "</body>\n"
            + "</html>";

        // <svg> nested in an <svg> that declares the SVG namespace.
        final String nestedWithSvgNamespace = "<!DOCTYPE html>\n"
            + "<html lang=\"en\">\n"
            + "<head>\n"
            + "</head>\n"
            + "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
            + "<svg xmlns=\"http://www.w3.org/2000/svg\">\n"
            + " <svg></svg>\n"
            + "</svg>\n"
            + "</body>\n"
            + "</html>";

        final String[] documents = {nestedWithShortNamespace, circleOnly, nestedWithSvgNamespace};
        for (String document : documents) {
            new HtmlCleaner().clean(document);
        }
    }

    /**
     * An {@code <svg>} that is never closed and is followed by a heading and a
     * div: the expected output closes the svg immediately and keeps the
     * heading and div as its siblings.
     */
    @Test
    public void svgCloseAssumedNS4() throws Exception {
        final String html = "<html><head></head><body><svg><h3>Title</h3><div>text</div></body></html>";
        assertEquals(
            "<html><head></head><body><svg></svg><h3>Title</h3><div>text</div></body></html>",
            serializeNamespaceAware(html));
    }

    /**
     * An unclosed {@code <svg>} directly followed by an {@code <a>} element
     * wrapping further HTML content. Currently ignored.
     */
    @Test
    @Ignore // This is a tricky one as "a" is allowed in SVG, so the rest is assumed to be OK.
    public void svgCloseAssumedNS3() throws Exception {
        final String html = "<html><head></head><body><svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
        assertEquals(
            "<html><head></head><body><svg></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>",
            serializeNamespaceAware(html));
    }

    /**
     * A properly closed {@code <svg>} containing an empty {@code <title>},
     * followed by an {@code <a>} element with a stray {@code </cite>} inside:
     * the svg and its title are expected unchanged, the stray end tag dropped.
     */
    @Test
    public void svgCloseAssumedNS2() throws Exception {
        final String html = "<html><head></head><body><svg><title></title></svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
        assertEquals(
            "<html><head></head><body><svg><title></title></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>",
            serializeNamespaceAware(html));
    }

    /**
     * An empty, properly closed {@code <svg>} followed by an {@code <a>}
     * element with a stray {@code </cite>} inside: the svg is expected
     * unchanged and the stray end tag dropped.
     */
    @Test
    public void svgCloseAssumedNS() throws Exception {
        final String html = "<html><head></head><body><svg></svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
        assertEquals(
            "<html><head></head><body><svg></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>",
            serializeNamespaceAware(html));
    }

    /**
     * An {@code <svg>} without any namespace declaration that contains a
     * {@code <title>}: both the document title and the svg title are expected
     * to stay where they are.
     */
    @Test
    public void missingSVGNamespace() throws IOException {
        final String input = "<html><head><title>Title of document</title></head><body><svg><title>A big circle.</title></svg></body></html>";
        final String wanted = "<html>\n<head><title>Title of document</title></head>\n<body><svg><title>A big circle.</title></svg></body></html>";
        assertCleaned(input, wanted);
    }

    /** Fixture {@code test18}: unknown tags omitted, namespace-aware cleaning. */
    @Test
    public void preserveSVGtags() throws IOException {
        CleanerProperties props = svgFixtureProperties();
        props.setOmitUnknownTags(true);
        props.setNamespacesAware(true);
        assertFixtureCleaned("src/test/resources/test18.html", "src/test/resources/test18_expected.html");
    }

    /** Fixture {@code test19}: unknown tags omitted, namespace-aware cleaning. */
    @Test
    public void preserveSVGtags2() throws IOException {
        CleanerProperties props = svgFixtureProperties();
        props.setOmitUnknownTags(true);
        props.setNamespacesAware(true);
        assertFixtureCleaned("src/test/resources/test19.html", "src/test/resources/test19_expected.html");
    }

    /**
     * Fixture {@code test20}: namespace-aware cleaning; the omit-unknown-tags
     * setting is deliberately left untouched here.
     */
    @Test
    public void preserveSVGtags3() throws IOException {
        CleanerProperties props = svgFixtureProperties();
        props.setNamespacesAware(true);
        assertFixtureCleaned("src/test/resources/test20.html", "src/test/resources/test20_expected.html");
    }

    /** Fixture {@code test21}: namespace-aware cleaning with unknown tags omitted. */
    @Test
    public void preserveSVGtagsWithTitle() throws IOException {
        CleanerProperties props = svgFixtureProperties();
        props.setNamespacesAware(true);
        props.setOmitUnknownTags(true);
        assertFixtureCleaned("src/test/resources/test21.html", "src/test/resources/test21_expected.html");
    }

    /** Fixture {@code test25}: namespace-aware cleaning with unknown tags omitted. */
    @Test
    public void preserveSVGstylesInPlace() throws IOException {
        CleanerProperties props = svgFixtureProperties();
        props.setNamespacesAware(true);
        props.setOmitUnknownTags(true);
        assertFixtureCleaned("src/test/resources/test25.html", "src/test/resources/test25_expected.html");
    }

    /**
     * Cleans {@code html} with a fresh namespace-aware cleaner that omits the
     * XML declaration, and returns the tree serialized by a
     * {@link SimpleHtmlSerializer} constructed with {@code false} as its
     * second argument.
     */
    private static String serializeNamespaceAware(String html) throws Exception {
        CleanerProperties props = new CleanerProperties();
        props.setNamespacesAware(true);
        props.setOmitXmlDeclaration(true);
        HtmlCleaner nsCleaner = new HtmlCleaner(props);
        // The serializer is created before cleaning, as in the inline form of this code.
        SimpleHtmlSerializer htmlSerializer = new SimpleHtmlSerializer(nsCleaner.getProperties(), false);
        return htmlSerializer.getAsString(nsCleaner.clean(html));
    }

    /**
     * Switches the test's cleaner to keep both the XML declaration and the
     * doctype declaration, and returns its properties so the caller can apply
     * the remaining settings in its own order.
     */
    private CleanerProperties svgFixtureProperties() {
        CleanerProperties props = cleaner.getProperties();
        props.setOmitXmlDeclaration(false);
        props.setOmitDoctypeDeclaration(false);
        return props;
    }

    /**
     * Reads the input fixture, then the expected-output fixture, and passes
     * both to {@code assertCleaned}.
     */
    private void assertFixtureCleaned(String initialPath, String expectedPath) throws IOException {
        String initial = readFile(initialPath);
        String expected = readFile(expectedPath);
        assertCleaned(initial, expected);
    }
}
@@ -0,0 +1,77 @@
package org.htmlcleaner;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.junit.Test;
/**
 * Tests for some common uses of {@code <script>} tags within {@code <head>}
 * elements, and for script code carried in event-handler attributes.
 *
 * @author scottw
 */
public class ScriptTest extends AbstractHtmlCleanerTest {

    /**
     * Cleans the {@code script_test.html} fixture with HTML inside attributes,
     * multi-word attributes and Unicode recognition enabled and comments
     * omitted, and checks that a document tree is produced.
     *
     * @throws IOException if raised while cleaning the fixture file
     */
    @Test
    public void another() throws IOException {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        CleanerProperties props = htmlCleaner.getProperties();
        props.setAllowHtmlInsideAttributes(true);
        props.setAllowMultiWordAttributes(true);
        props.setRecognizeUnicodeChars(true);
        props.setOmitComments(true);

        TagNode rootNode = htmlCleaner.clean(new File("src/test/resources/script_test.html"));

        // Previously the result was stored but never inspected.
        assertNotNull("cleaning script_test.html produced no document tree", rootNode);
    }

    /**
     * Checks that the direct {@code <script>} children of {@code <head>} in
     * the {@code script_test.html} fixture are exactly the three scripts
     * {@code x.js}, {@code y.js} and {@code z.js}, in that order.
     *
     * @throws IOException if raised while cleaning the fixture file
     */
    @Test
    public void getScripts() throws IOException {
        HtmlCleaner htmlCleaner = new HtmlCleaner();
        TagNode html = htmlCleaner.clean(new File("src/test/resources/script_test.html"));
        TagNode head = html.findElementByName("head", false);
        // Fail with a readable message rather than a NullPointerException below.
        assertNotNull("cleaned document has no <head> element", head);

        List<TagNode> scripts = new ArrayList<TagNode>();
        List<TagNode> children = head.getChildTagList();
        for (TagNode child : children) {
            // Constant-first comparison so a child without a name is simply skipped.
            if ("script".equals(child.getName())) {
                scripts.add(child);
            }
        }

        assertEquals(3, scripts.size());
        assertEquals("x.js", scripts.get(0).getAttributeByName("src"));
        assertEquals("y.js", scripts.get(1).getAttributeByName("src"));
        assertEquals("z.js", scripts.get(2).getAttributeByName("src"));
    }

    /**
     * Checks that double quotes inside a single-quoted {@code onclick}
     * attribute come out as {@code &quot;} entities in a double-quoted
     * attribute.
     *
     * @throws IOException if raised by the cleaning assertion
     */
    @Test
    public void scriptAttribute() throws IOException {
        cleaner.getProperties().setUseCdataForScriptAndStyle(true);
        String initial = "<button onclick='aaa(\"bbb\")'>Click here!</button>";
        String expected = "<html>\n<head />\n<body><button onclick=\"aaa(&quot;bbb&quot;)\">Click here!</button></body></html>";
        assertCleaned(initial, expected);
    }

    /**
     * Test for issue #88 - thanks to Serge Dyomin.
     *
     * Cleans the {@code test16.html} fixture with the XML declaration omitted,
     * comments kept and special entities translated, and compares the
     * {@link SimpleHtmlSerializer} output with {@code test16_expected.html}.
     *
     * @throws IOException if a fixture file cannot be read
     */
    @Test
    public void scriptAttributeQuotes() throws IOException {
        HtmlCleaner thecleaner = new HtmlCleaner();
        CleanerProperties props = thecleaner.getProperties();
        props.setOmitXmlDeclaration(true);
        props.setOmitComments(false);
        props.setTranslateSpecialEntities(true);

        String initial = readFile("src/test/resources/test16.html");
        String expected = readFile("src/test/resources/test16_expected.html");

        String output = new SimpleHtmlSerializer(thecleaner.getProperties()).getAsString(thecleaner.clean(initial));
        assertEquals(expected, output);
    }
}

Some files were not shown because too many files have changed in this diff Show More