Merge pull request 'merge version 1.' (#1) from master into main

Reviewed-on: https://src.isharkfly.com/honeymoose/HtmlCleaner/pulls/1
2025-04-24 15:36:02 +00:00
parent cf604ae174 304b2cdada
commit d79bfb52ff
182 changed files with 36298 additions and 1 deletions
@@ -0,0 +1,12 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Environment-dependent path to Maven home directory
+/mavenHomeManager.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Zeppelin ignored files
+/ZeppelinRemoteNotebooks/
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="CheckStyle-IDEA" serialisationVersion="2">
+    <checkstyleVersion>10.23.0</checkstyleVersion>
+    <scanScope>JavaOnly</scanScope>
+    <copyLibs>true</copyLibs>
+    <option name="thirdPartyClasspath" />
+    <option name="activeLocationIds" />
+    <option name="locations">
+      <list>
+        <ConfigurationLocation id="bundled-sun-checks" type="BUNDLED" scope="All" description="Sun Checks">(bundled)</ConfigurationLocation>
+        <ConfigurationLocation id="bundled-google-checks" type="BUNDLED" scope="All" description="Google Checks">(bundled)</ConfigurationLocation>
+      </list>
+    </option>
+  </component>
+</project>
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="CompilerConfiguration">
+    <annotationProcessing>
+      <profile name="Maven default annotation processors profile" enabled="true">
+        <sourceOutputDir name="target/generated-sources/annotations" />
+        <sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
+        <outputRelativeToContentRoot value="true" />
+        <module name="htmlcleaner" />
+      </profile>
+    </annotationProcessing>
+  </component>
+</project>
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding">
+    <file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
+    <file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
+  </component>
+</project>
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="RemoteRepositoriesConfiguration">
+    <remote-repository>
+      <option name="id" value="sonatype-nexus-snapshots" />
+      <option name="name" value="Sonatype Nexus Snapshots" />
+      <option name="url" value="https://oss.sonatype.org/content/repositories/snapshots" />
+    </remote-repository>
+    <remote-repository>
+      <option name="id" value="ossez-repo-releases" />
+      <option name="name" value="iSharkFly Private Releases" />
+      <option name="url" value="https://repo.isharkfly.com/repository/isharkfly-maven-releases/" />
+    </remote-repository>
+    <remote-repository>
+      <option name="id" value="central" />
+      <option name="name" value="Central Repository" />
+      <option name="url" value="https://repo.isharkfly.com/repository/maven/" />
+    </remote-repository>
+    <remote-repository>
+      <option name="id" value="ossez-repo-snapshots" />
+      <option name="name" value="iSharkFly Private Snapshots" />
+      <option name="url" value="https://repo.isharkfly.com/repository/isharkfly-maven-snapshots/" />
+    </remote-repository>
+    <remote-repository>
+      <option name="id" value="central" />
+      <option name="name" value="Maven Central repository" />
+      <option name="url" value="https://repo1.maven.org/maven2" />
+    </remote-repository>
+    <remote-repository>
+      <option name="id" value="jboss.community" />
+      <option name="name" value="JBoss Community repository" />
+      <option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
+    </remote-repository>
+  </component>
+</project>
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
@@ -1,6 +1,6 @@
 HtmlCleaner is a project originally developed by Vladimir Nikic (http://htmlcleaner.sourceforge.net/).

-This version is modified by Zheng Sun.
+This version is modified by iSharkFly.

 Briefly speaking, the modifications are

@@ -0,0 +1,44 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+ 
+    Additional work by Amplafi. -- All rights released.
+ */
+package org.htmlcleaner;
+
+/**
+ * Contract for a single attribute transformation rule: a test over an
+ * attribute's name and value, paired with a template string.
+ */
+public interface AttributeTransformation {
+
+    /**
+     * Tells whether this transformation applies to the given attribute.
+     *
+     * @param attName  name of the attribute being examined
+     * @param attValue value of the attribute being examined
+     * @return {@code true} when the attribute satisfies this rule
+     */
+    boolean satisfy(String attName, String attValue);
+
+    /**
+     * @return the template associated with this transformation
+     */
+    String getTemplate();
+}
@@ -0,0 +1,72 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+ 
+    Additional work by Amplafi. -- All rights released.
+ */
+package org.htmlcleaner;
+
+import java.util.regex.Pattern;
+
+/**
+ * {@link AttributeTransformation} whose applicability test is driven by two
+ * optional regular expressions: one searched for in the attribute name and one
+ * searched for in the attribute value. A {@code null} pattern places no
+ * constraint on its side of the test.
+ */
+public class AttributeTransformationPatternImpl implements AttributeTransformation {
+    private final Pattern attNamePattern;
+    private final Pattern attValuePattern;
+    private final String template;
+
+    /**
+     * @param attNamePattern  pattern searched for in the attribute name, or {@code null} to accept any name
+     * @param attValuePattern pattern searched for in the attribute value, or {@code null} to accept any value
+     * @param template        value returned by {@link #getTemplate()}
+     */
+    public AttributeTransformationPatternImpl(Pattern attNamePattern, Pattern attValuePattern, String template) {
+        this.attNamePattern = attNamePattern;
+        this.attValuePattern = attValuePattern;
+        this.template = template;
+    }
+
+    /**
+     * Convenience constructor that compiles the given regular expressions.
+     *
+     * @param attNamePattern  regular expression for the attribute name, or {@code null} to accept any name
+     * @param attValuePattern regular expression for the attribute value, or {@code null} to accept any value
+     * @param template        value returned by {@link #getTemplate()}
+     * @throws java.util.regex.PatternSyntaxException if either expression is not a valid regular expression
+     */
+    public AttributeTransformationPatternImpl(String attNamePattern, String attValuePattern, String template) {
+        this(compileOrNull(attNamePattern), compileOrNull(attValuePattern), template);
+    }
+
+    /**
+     * Tests the attribute against both patterns using {@code find()}, so a
+     * pattern only needs to occur somewhere in the text unless it is anchored.
+     *
+     * @param attName  attribute name; may be {@code null}
+     * @param attValue attribute value; may be {@code null}
+     * @return {@code true} when every non-null pattern is found in its input;
+     *         a {@code null} input never satisfies a non-null pattern
+     */
+    public boolean satisfy(String attName, String attValue) {
+        return found(attNamePattern, attName) && found(attValuePattern, attValue);
+    }
+
+    /**
+     * @return the template
+     */
+    public String getTemplate() {
+        return template;
+    }
+
+    /** Compiles {@code regex}, passing {@code null} through unchanged. */
+    private static Pattern compileOrNull(String regex) {
+        return regex == null ? null : Pattern.compile(regex);
+    }
+
+    /** A missing pattern accepts anything; otherwise the input must be non-null and contain a match. */
+    private static boolean found(Pattern pattern, String input) {
+        if (pattern == null) {
+            return true;
+        }
+        // Pattern.matcher(null) would throw NullPointerException, so treat null input as "no match".
+        return input != null && pattern.matcher(input).find();
+    }
+}
@@ -0,0 +1,38 @@
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Basic {@link HtmlNode} implementation that adds a parent reference on top of
+ * the row/column tracking inherited from {@link BaseTokenImpl}.
+ */
+public class BaseHtmlNode extends BaseTokenImpl implements HtmlNode {
+
+    /** Enclosing tag node, or {@code null} when no parent has been assigned. */
+    protected TagNode parent;
+
+    /**
+     * Lists the tokens that share this node's parent.
+     *
+     * @return a new, empty list when this node has no parent (root node);
+     *         otherwise all children of the parent, which includes this node
+     */
+    public List<? extends BaseToken> getSiblings() {
+        if (this.parent != null) {
+            return this.parent.getAllChildren();
+        }
+        return new ArrayList<BaseToken>();
+    }
+
+    /**
+     * @return the parent tag node, or {@code null} if none is set
+     */
+    public TagNode getParent() {
+        return parent;
+    }
+
+    /**
+     * @param parent the tag node to record as this node's parent
+     */
+    public void setParent(TagNode parent) {
+        this.parent = parent;
+    }
+
+    /**
+     * Does nothing in this base class: no output is written.
+     *
+     * @param serializer unused here
+     * @param writer     unused here
+     * @throws IOException never thrown by this implementation
+     */
+    public void serialize(Serializer serializer, Writer writer) throws IOException {
+        // TODO Auto-generated method stub
+    }
+
+}
@@ -0,0 +1,72 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+
+/**
+ * <p>
+ *  Base token interface. Tokens are the individual entities recognized by the HTML parser.
+ *  Every token can serialize itself and carries the row and column at which it was found.
+ * </p>
+ */
+public interface BaseToken {
+
+    /**
+     * Writes this token to {@code writer} with the help of {@code serializer}.
+     *
+     * @param serializer serializer in use
+     * @param writer     destination of the output
+     * @throws IOException if writing fails
+     */
+    void serialize(Serializer serializer, Writer writer) throws IOException;
+
+    /**
+     * @return row in source html where the token was found
+     */
+    int getRow();
+
+    /**
+     * @param row row in source html where the token was found
+     */
+    void setRow(int row);
+
+    /**
+     * @return col in source html where the token was found
+     */
+    int getCol();
+
+    /**
+     * @param col col in source html where the token was found
+     */
+    void setCol(int col);
+
+}
@@ -0,0 +1,40 @@
+package org.htmlcleaner;
+
+/**
+ * Base class for all tokens. Allows position tracking.
+ *
+ * @author Konstantin Burov (aectann@gmail.com)
+ *
+ */
+public abstract class BaseTokenImpl implements BaseToken {
+
+    private int row;
+    private int col;
+    
+    protected BaseTokenImpl(){
+    	
+    }
+
+    protected BaseTokenImpl(int row, int col) {
+        this.row = row;
+        this.col = col;
+    }
+    public int getRow() {
+        return row;
+    }
+    public void setRow(int row) {
+        this.row = row;
+    }
+    public int getCol() {
+        return col;
+    }
+    public void setCol(int col) {
+        this.col = col;
+    }
+
+    @Override
+    public String toString() {
+        return "(line="+getRow()+", col="+getCol()+")";
+    }
+
+}
@@ -0,0 +1,74 @@
+/*  
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+package org.htmlcleaner;
+
+/**
+ * @author patmoore
+ *
+ */
+public enum BelongsTo {
+
+    HEAD_AND_BODY("all"),
+    HEAD("head"),
+    BODY("body");
+    private final String dbCode;
+    private BelongsTo(String dbCode) {
+        this.dbCode =dbCode;
+    }
+    
+    /**
+     * @return the dbCode
+     */
+    public String getDbCode() {
+        return dbCode;
+    }
+    
+    public static BelongsTo toValue(Object value) {
+        BelongsTo result = null;
+        if ( value instanceof BelongsTo) {
+            result = (BelongsTo) value;
+        } else if ( value != null ) {
+            String dbCode = value.toString().trim(); 
+            for(BelongsTo belongsTo: BelongsTo.values()) {
+                if ( belongsTo.getDbCode().equalsIgnoreCase(dbCode) || belongsTo.name().equalsIgnoreCase(dbCode)) {
+                    result = belongsTo;
+                    break;
+                }
+            }
+        }
+        
+        return result;
+    }
+}
@@ -0,0 +1,152 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.StringTokenizer;
+
+/**
+ * <p>
+ *  Browser compact XML serializer - creates resulting XML by stripping whitespaces wherever possible,
+ *  but preserving single whitespace where at least one exists. This behaviour is well suited
+ *  for web-browsers, which usually treat multiple whitespaces as single one, but make difference
+ *  between single whitespace and empty text.
+ * </p>
+ */
+public class BrowserCompactXmlSerializer extends XmlSerializer {
+
+    private static final String PRE_TAG = "pre";
+    private static final String BR_TAG = "<br />";
+    private static final String LINE_BREAK = "\n";
+
+    public BrowserCompactXmlSerializer(CleanerProperties props) {
+        super(props);
+    }
+
+    /**
+     * Serializes {@code tagNode} and its children in compact form.
+     * <p>
+     * The open tag is always written. Unless {@code isMinimizedTagSyntax(tagNode)} is true,
+     * each child is then written and the end tag follows: content nodes outside a
+     * {@code pre} tag are escaped, trimmed and have their line breaks collapsed; content
+     * nodes inside {@code pre} are written unchanged; comment nodes are written as their
+     * trimmed commented content; every other token serializes itself.
+     * </p>
+     *
+     * @param tagNode node to serialize
+     * @param writer destination of the generated output
+     * @throws IOException if writing to {@code writer} fails
+     */
+    @Override
+    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
+        serializeOpenTag(tagNode, writer, false);
+        TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
+        // tagName stays null for tags the provider does not know, so they are never treated as "pre".
+        String tagName = tagInfo!=null? tagInfo.getName() : null;
+        // Work on a copy of the children: childrenIt.remove() below only affects this copy.
+        List<? extends BaseToken> tagChildren = new ArrayList<BaseToken>(tagNode.getAllChildren());
+        if (!isMinimizedTagSyntax(tagNode)) {
+            ListIterator<? extends BaseToken> childrenIt = tagChildren.listIterator();
+            while (childrenIt.hasNext()) {
+                Object item = childrenIt.next();
+                if (item != null) {
+                    if (item instanceof ContentNode && !PRE_TAG.equals(tagName)) {
+                        String content = ((ContentNode) item).getContent();
+                        content = dontEscape(tagNode) ? content.replaceAll("]]>", "]]&gt;") : escapeXml(content);
+                        // A leading or trailing run of SpecialEntities.NON_BREAKABLE_SPACE becomes one plain space.
+                        content = content.replaceAll("^"+SpecialEntities.NON_BREAKABLE_SPACE+"+", " ");
+                        content = content.replaceAll(SpecialEntities.NON_BREAKABLE_SPACE+"+$", " ");
+                        boolean whitespaceAllowed = tagInfo != null && tagInfo.getDisplay().isLeadingAndEndWhitespacesAllowed();
+                        // Remember the surrounding whitespace before trim() discards it.
+                        boolean writeLeadingSpace = content.length() > 0 && (Character.isWhitespace(content.charAt(0)));
+                        boolean writeEndingSpace = content.length() > 1 && Character.isWhitespace(content.charAt(content.length() - 1));
+                        content = content.trim();
+                        if (content.length() != 0) {
+                            boolean hasPrevContent = false;
+                            int order = tagChildren.indexOf(item);
+                            // NOTE(review): the preceding sibling is only inspected when this item sits at
+                            // index 2 or higher; confirm that skipping index 1 is intentional.
+                            if (order >= 2) {
+                                Object prev = tagChildren.get(order-1);
+                                hasPrevContent = isContentOrInline(prev);
+                            }
+
+                            if (writeLeadingSpace && (whitespaceAllowed || hasPrevContent)) {
+                                writer.write(' ');
+                            }
+
+                            // The tokenizer returns the line breaks themselves as tokens (returnDelims = true).
+                            StringTokenizer tokenizer = new StringTokenizer(content, LINE_BREAK, true);
+                            String prevToken = "";
+                            while (tokenizer.hasMoreTokens()) {
+                                String token = tokenizer.nextToken();
+                                if (prevToken.equals(token) && prevToken.equals(LINE_BREAK)) {
+                                    // A line break directly following another line break is written as BR_TAG.
+                                    writer.write(BR_TAG);
+                                    // NOTE(review): this reset is overwritten by "prevToken = token" at the end
+                                    // of the loop body, so it has no effect as written.
+                                    prevToken = "";
+                                } else if (LINE_BREAK.equals(token)) {
+                                    writer.write(' ');
+                                } else {
+                                    writer.write(token.trim());
+                                }
+                                prevToken = token;
+                            }
+
+                            boolean hasFollowingContent = false;
+                            if (childrenIt.hasNext()) {
+                                // Look at the next child, then step back so the main loop still visits it.
+                                Object next = childrenIt.next();
+                                hasFollowingContent = isContentOrInline(next);
+                                childrenIt.previous();
+                            }
+
+                            if (writeEndingSpace && (whitespaceAllowed || hasFollowingContent)) {
+                                writer.write(' ');
+                            }
+                        } else{
+                            // Content that is empty after trimming is dropped from the working copy.
+                            childrenIt.remove();
+                        }
+                    } else if(item instanceof ContentNode){
+                        // Content of a "pre" tag is written as is, without escaping or trimming.
+                        String content = ((ContentNode) item).getContent();
+                        writer.write(content);
+                    } else if (item instanceof CommentNode) {
+                    	String content = ((CommentNode) item).getCommentedContent().trim();
+                    	writer.write(content);
+                    } else {
+                    	((BaseToken)item).serialize(this, writer);
+                    }
+                }
+            }
+
+            serializeEndTag(tagNode, writer, tagInfo != null && tagInfo.getDisplay().isAfterTagLineBreakNeeded());
+        }
+    }
+
+    /**
+     * @param node token to classify
+     * @return {@code true} if {@code node} is a {@code ContentNode}, or a {@code TagNode}
+     *         whose tag info is known and has display {@code Display.inline}
+     */
+    private boolean isContentOrInline(Object node) {
+        boolean result = false;
+        if (node instanceof ContentNode) {
+            result = true;
+        } else if (node instanceof TagNode) {
+            TagInfo nextInfo = props.getTagInfoProvider().getTagInfo(((TagNode) node).getName());
+            result = nextInfo != null && nextInfo.getDisplay() == Display.inline;
+        }
+        return result;
+    }
+
+}
@@ -0,0 +1,75 @@
+/*  Copyright (c) 2006-2013, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+package org.htmlcleaner;
+
+public class CData extends ContentNode implements HtmlNode {
+	
+    public static final String BEGIN_CDATA = "<![CDATA[";
+    public static final String END_CDATA = "]]>";
+    public static final String SAFE_BEGIN_CDATA = "/*<![CDATA[*/";
+    public static final String SAFE_END_CDATA = "/*]]>*/";
+    public static final String SAFE_BEGIN_CDATA_ALT = "//<![CDATA[";
+    public static final String SAFE_END_CDATA_ALT = "//]]>";
+	
+	public CData(String content){
+		super(content);	
+	}
+	
+	public String getContentWithoutStartAndEndTokens(){
+		return this.content;
+	}
+
+	/* (non-Javadoc)
+	 * @see org.htmlcleaner.ContentNode#getContent()
+	 */
+	@Override
+	public String getContent() {
+		return getContentWithoutStartAndEndTokens();
+	}
+
+	/* (non-Javadoc)
+	 * @see org.htmlcleaner.ContentNode#toString()
+	 */
+	@Override
+	public String toString() {
+		return getContentWithStartAndEndTokens();
+	}
+	
+	public String getContentWithStartAndEndTokens(){
+		return SAFE_BEGIN_CDATA + this.content + SAFE_END_CDATA;
+	}
+	
+	
+	
+	
+}
@@ -0,0 +1,54 @@
+package org.htmlcleaner;
+
+import java.util.Stack;
+
+/**
+ * Contains information about nodes that were closed due to their child nodes.
+ * i.e. if 'p' tag was closed due to 'table' child tag.
+ *
+ * @author Konstantin Burov
+ *
+ */
+class ChildBreaks{
+	Stack < TagPos> closedByChildBreak = new Stack < TagPos >();
+	private Stack < TagPos > breakingTags = new Stack < TagPos >();
+
+	/**
+	 * Adds the break info to the top of the stacks.
+	 *
+	 * @param closedPos - position of the tag that was closed due to incorrect child
+	 * @param breakPos - position of the child that has broken its parent
+	 */
+	public void addBreak(TagPos closedPos, TagPos breakPos){
+		closedByChildBreak.add(closedPos);
+		breakingTags.add(breakPos);
+	}
+
+	public boolean isEmpty() {
+		return closedByChildBreak.isEmpty();
+	}
+
+	/**
+	 * @return name of the last children tag that has broken its parent.
+	 */
+	public String getLastBreakingTag() {
+		return breakingTags.peek().name;
+	}
+
+	/**
+	 * pops out latest broken tag position.
+	 *
+	 * @return tag pos of the last parent that was broken.
+	 */
+	public TagPos pop() {
+		breakingTags.pop();
+		return closedByChildBreak.pop();
+	}
+
+	/**
+	 * @return position of the last tag that has broken its parent. -1 if no such tag found.
+	 */
+	public int getLastBreakingTagPosition() {
+		return breakingTags.isEmpty()?-1:breakingTags.peek().position;
+	}
+}
@@ -0,0 +1,80 @@
+/*  Copyright (c) 2006-2013, HtmlCleaner Team (Vladimir Nikic, Pat Moore, Scott Wilson)
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+*/
+package org.htmlcleaner;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Set;
+import java.util.Stack;
+import java.util.TreeSet;
+
+import org.htmlcleaner.conditional.ITagNodeCondition;
+
+/**
+ * This class is for thread-safe handling of private instance variables from HtmlCleaner.
+ * It is a plain holder: every field is package-visible and the class declares no methods.
+ */
+class CleanTimeValues {
+	
+    // Presumably record whether a head / body tag has been opened so far - confirm in HtmlCleaner.
+    boolean _headOpened = false;
+    boolean _bodyOpened = false;
+    // Raw set backed by a LinkedHashSet, so iteration follows insertion order.
+    @SuppressWarnings("rawtypes")
+	Set _headTags = new LinkedHashSet();
+    // Raw set backed by a TreeSet, so iteration follows the elements' sort order.
+    @SuppressWarnings("rawtypes")
+	Set allTags = new TreeSet();
+    // NOTE(review): "transient" only matters for Java serialization, and this class is not
+    // declared Serializable in the code shown - confirm whether the modifier is needed.
+    transient Stack<NestingState> nestingStates = new Stack<NestingState>();
+
+    // References to well-known nodes; none is initialized here, so each starts as null.
+    TagNode htmlNode;
+    TagNode bodyNode;
+    TagNode headNode;
+    TagNode rootNode;
+
+    Set<ITagNodeCondition> pruneTagSet = new HashSet<ITagNodeCondition>();
+    Set<TagNode> pruneNodeSet = new HashSet<TagNode>();
+    // Unlike the two prune sets, this one is not initialized here and starts as null.
+    Set<ITagNodeCondition> allowTagSet;
+    
+    /**
+     * A stack of namespaces for currently open tags. Every xmlns declaration
+     * on a tag adds another namespace to the stack, which is removed when the
+     * tag is closed. In this way you can keep track of what namespace a tag
+     * belongs to.
+     */
+    transient Stack<String> namespace = new Stack<String>();
+    
+    /**
+     * A map of all the namespace prefixes and URIs declared within the document.
+     * We use this to check whether any prefixes remain undeclared.
+     */ 
+    transient HashMap<String, String> namespaceMap = new HashMap<String, String>();
+}
@@ -0,0 +1,665 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import org.htmlcleaner.audit.ErrorType;
+import org.htmlcleaner.audit.HtmlModificationListener;
+import org.htmlcleaner.conditional.ITagNodeCondition;
+import org.htmlcleaner.conditional.TagNodeAutoGeneratedCondition;
+import org.htmlcleaner.conditional.TagNodeNameCondition;
+
+/**
+ * Properties defining cleaner's behaviour
+ */
+public class CleanerProperties implements HtmlModificationListener{
+    // Force consistent cross-platform encoding ( mandatory for reliable server operation)
+    public static final String DEFAULT_CHARSET = "UTF-8";
+    public static final String BOOL_ATT_SELF = "self";
+    public static final String BOOL_ATT_EMPTY = "empty";
+    public static final String BOOL_ATT_TRUE = "true";
+
+    private ITagInfoProvider tagInfoProvider;
+    /**
+     * If this parameter is set to true, an ampersand sign (&) that starts a valid XML character sequence (&XXX;) will not be escaped as &amp;XXX;
+     */
+    private boolean advancedXmlEscape;
+    private String useCdataFor;
+    private List<String> useCdataForList;
+    private boolean translateSpecialEntities;
+    private boolean recognizeUnicodeChars;
+    private boolean omitUnknownTags;
+    private boolean treatUnknownTagsAsContent;
+    private boolean omitDeprecatedTags;
+    private boolean omitComments;
+    private boolean treatDeprecatedTagsAsContent;
+    private OptionalOutput omitXmlDeclaration;
+    private OptionalOutput omitDoctypeDeclaration;
+    private OptionalOutput omitHtmlEnvelope;
+    private boolean useEmptyElementTags;
+    private boolean allowMultiWordAttributes;
+    private String booleanAttributeValues;
+    private boolean ignoreQuestAndExclam;
+    private boolean allowHtmlInsideAttributes;
+    private boolean namespacesAware;
+    private boolean transSpecialEntitiesToNCR;
+    private boolean omitCdataOutsideScriptAndStyle;
+    private boolean deserializeEntities;
+    private boolean trimAttributeValues;
+    private int htmlVersion;
+
+    private boolean allowInvalidAttributeNames;
+    private String invalidAttributeNamePrefix;
+
+    /**
+     * Provides an arbitrary recursion depth
+     */
+    private int maxDepth;
+    public int getMaxDepth() {
+        return maxDepth;
+    }
+    public void setMaxDepth(int maxDepth) {
+        this.maxDepth = maxDepth;
+    }
+    
+    /**
+     * The cleaner cannot keep track of whitespace at that level: two lists are
+     * built, one for the head and one for the body, so whitespace that falls outside
+     * of the head and body is not preserved. This option creates at least a newline break.
+     *
+     * Truly preserving the whitespace would be more work than is wanted at this point.
+     */
+    private boolean addNewlineToHeadAndBody;
+    /**
+     * Tries to keep inside head all whitespace and comments that were originally there
+     */
+    private boolean keepWhitespaceAndCommentsInHead;
+    private String hyphenReplacementInComment;
+    // Comma-separated list of tags to prune.
+    private String pruneTags;
+    // Comma-separated list of allowed tags.
+    private String allowTags;
+
+    private CleanerTransformations cleanerTransformations = new CleanerTransformations();
+
+    private List < HtmlModificationListener > htmlModificationListeners;
+
+    /**
+     * blacklist of tags
+     */
+    private Set<ITagNodeCondition> pruneTagSet = new HashSet<ITagNodeCondition>();
+    /**
+     * the list of allowed tags (whitelist approach v. blacklist approach of pruneTags )
+     */
+    private Set<ITagNodeCondition> allowTagSet = new HashSet<ITagNodeCondition>();
+    private String charset = DEFAULT_CHARSET;
+    private boolean transResCharsToNCR;
+
+    public CleanerProperties() {
+        reset();
+    }
+
+    /**
+     * @param tagInfoProvider
+     */
+    public CleanerProperties(ITagInfoProvider tagInfoProvider) {
+        reset();
+        this.tagInfoProvider = tagInfoProvider;
+    }
+
+    /**
+     * @param tagInfoProvider the tagInfoProvider to set
+     */
+    void setTagInfoProvider(ITagInfoProvider tagInfoProvider) {
+        this.tagInfoProvider = tagInfoProvider;
+    }
+
+    public ITagInfoProvider getTagInfoProvider() {
+        return tagInfoProvider;
+    }
+
+    public boolean isAdvancedXmlEscape() {
+        return advancedXmlEscape;
+    }
+
+    public void setAdvancedXmlEscape(boolean advancedXmlEscape) {
+        this.advancedXmlEscape = advancedXmlEscape;
+    }
+
+    public boolean isTransResCharsToNCR() {
+        return transResCharsToNCR;
+    }
+
+    public void setTransResCharsToNCR(boolean transResCharsToNCR) {
+        this.transResCharsToNCR = transResCharsToNCR;
+    }
+
+    public boolean isUseCdataForScriptAndStyle() {
+    	return isUseCdataFor("script") && isUseCdataFor("style");
+    }
+
+    public void setUseCdataForScriptAndStyle(boolean useCdataForScriptAndStyle) {
+    	if (useCdataForScriptAndStyle)
+    		setUseCdataFor("script,style");
+    	else
+    		setUseCdataFor("");
+    }
+    
+    public void setUseCdataFor(String useCdataFor) {
+    	if (useCdataFor != null) {
+    		this.useCdataFor = useCdataFor;
+    		this.useCdataForList = Arrays.asList(useCdataFor.toLowerCase().split(","));
+    	} else {
+    		this.useCdataFor = "";
+    		this.useCdataForList = null;
+    	}
+    }
+
+
+    
+    public String getUseCdataFor() {
+    	return this.useCdataFor;
+    }
+    
+    public boolean isUseCdataFor(String useCdataFor) {
+    	if (useCdataForList != null && useCdataFor != null)
+    		return useCdataForList.contains(useCdataFor.toLowerCase());
+    	else
+    		return false;
+    }
+
+    public boolean isTranslateSpecialEntities() {
+        return translateSpecialEntities;
+    }
+
+    /**
+     * TODO : use {@link OptionalOutput}
+     * @param translateSpecialEntities
+     */
+    public void setTranslateSpecialEntities(boolean translateSpecialEntities) {
+        this.translateSpecialEntities = translateSpecialEntities;
+    }
+
+    public boolean isRecognizeUnicodeChars() {
+        return recognizeUnicodeChars;
+    }
+
+    public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
+        this.recognizeUnicodeChars = recognizeUnicodeChars;
+    }
+
+    public boolean isOmitUnknownTags() {
+        return omitUnknownTags;
+    }
+
+    public void setOmitUnknownTags(boolean omitUnknownTags) {
+        this.omitUnknownTags = omitUnknownTags;
+    }
+
+    public boolean isTreatUnknownTagsAsContent() {
+        return treatUnknownTagsAsContent;
+    }
+
+    public void setTreatUnknownTagsAsContent(boolean treatUnknownTagsAsContent) {
+        this.treatUnknownTagsAsContent = treatUnknownTagsAsContent;
+    }
+
+    public boolean isOmitDeprecatedTags() {
+        return omitDeprecatedTags;
+    }
+
+    public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
+        this.omitDeprecatedTags = omitDeprecatedTags;
+    }
+
+    public boolean isTreatDeprecatedTagsAsContent() {
+        return treatDeprecatedTagsAsContent;
+    }
+
+    public void setTreatDeprecatedTagsAsContent(boolean treatDeprecatedTagsAsContent) {
+        this.treatDeprecatedTagsAsContent = treatDeprecatedTagsAsContent;
+    }
+
+    public boolean isOmitComments() {
+        return omitComments;
+    }
+
+    public void setOmitComments(boolean omitComments) {
+        this.omitComments = omitComments;
+    }
+
+    public boolean isOmitXmlDeclaration() {
+        return omitXmlDeclaration == OptionalOutput.omit;
+    }
+
+    public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
+        this.omitXmlDeclaration = omitXmlDeclaration?OptionalOutput.omit:OptionalOutput.alwaysOutput;
+    }
+
+    /**
+     * @return true if the doctype declaration is to be omitted; this is also
+     *         the case whenever the HTML envelope is omitted
+     */
+    public boolean isOmitDoctypeDeclaration() {
+        return omitDoctypeDeclaration == OptionalOutput.omit || isOmitHtmlEnvelope();
+    }
+
+    public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
+        this.omitDoctypeDeclaration = omitDoctypeDeclaration?OptionalOutput.omit:OptionalOutput.alwaysOutput;
+    }
+
+    public boolean isOmitHtmlEnvelope() {
+        return omitHtmlEnvelope == OptionalOutput.omit;
+    }
+
+    public void setOmitHtmlEnvelope(boolean omitHtmlEnvelope) {
+        this.omitHtmlEnvelope = omitHtmlEnvelope?OptionalOutput.omit:OptionalOutput.alwaysOutput;
+    }
+
+    public boolean isUseEmptyElementTags() {
+        return useEmptyElementTags;
+    }
+
+    public void setUseEmptyElementTags(boolean useEmptyElementTags) {
+        this.useEmptyElementTags = useEmptyElementTags;
+    }
+
+    public boolean isAllowMultiWordAttributes() {
+        return allowMultiWordAttributes;
+    }
+
+    public void setAllowMultiWordAttributes(boolean allowMultiWordAttributes) {
+        this.allowMultiWordAttributes = allowMultiWordAttributes;
+    }
+
+    public boolean isAllowHtmlInsideAttributes() {
+        return allowHtmlInsideAttributes;
+    }
+
+    public void setAllowHtmlInsideAttributes(boolean allowHtmlInsideAttributes) {
+        this.allowHtmlInsideAttributes = allowHtmlInsideAttributes;
+    }
+
+    public boolean isIgnoreQuestAndExclam() {
+        return ignoreQuestAndExclam;
+    }
+
+    public void setIgnoreQuestAndExclam(boolean ignoreQuestAndExclam) {
+        this.ignoreQuestAndExclam = ignoreQuestAndExclam;
+    }
+
+    public boolean isNamespacesAware() {
+        return namespacesAware;
+    }
+
+    public void setNamespacesAware(boolean namespacesAware) {
+        this.namespacesAware = namespacesAware;
+    }
+
+    public boolean isAddNewlineToHeadAndBody() {
+        return addNewlineToHeadAndBody;
+    }
+
+    public void setAddNewlineToHeadAndBody(boolean addNewlineToHeadAndBody) {
+        this.addNewlineToHeadAndBody = addNewlineToHeadAndBody;
+    }
+
+    public boolean isKeepWhitespaceAndCommentsInHead() {
+        return keepWhitespaceAndCommentsInHead;
+    }
+
+    public void setKeepWhitespaceAndCommentsInHead(boolean keepHeadWhitespace) {
+        this.keepWhitespaceAndCommentsInHead = keepHeadWhitespace;
+    }
+
+    public String getHyphenReplacementInComment() {
+        return hyphenReplacementInComment;
+    }
+
+    public void setHyphenReplacementInComment(String hyphenReplacementInComment) {
+        this.hyphenReplacementInComment = hyphenReplacementInComment;
+    }
+
+    public String getPruneTags() {
+        return pruneTags;
+    }
+    
+    public boolean isOmitCdataOutsideScriptAndStyle(){
+    	return omitCdataOutsideScriptAndStyle;
+    }
+    public void setOmitCdataOutsideScriptAndStyle(boolean value){
+    	omitCdataOutsideScriptAndStyle = value;
+    }
+
+    public boolean isDeserializeEntities() {
+        return deserializeEntities;
+    }
+
+    public void setDeserializeEntities(boolean deserializeEntities) {
+        this.deserializeEntities = deserializeEntities;
+    }
+    /**
+     * Sets the html version according to the parameter.Also,it sets the
+     * tag provider to the appropriate version.
+     * 
+     * @param version Number 4 for html4 or 5 for html5
+     */
+    public void setHtmlVersion(int version){
+    	this.htmlVersion=version;
+    	if (version==4)
+    		this.setTagInfoProvider(Html4TagProvider.INSTANCE);
+    	else
+    		this.setTagInfoProvider(Html5TagProvider.INSTANCE);
+    }
+    
+    /**
+     * Return the html version
+     * @return int The html version
+     */
+    public int getHtmlVersion (){
+    	return this.htmlVersion;
+    }
+
+    public boolean isTrimAttributeValues() {
+        return trimAttributeValues;
+    }
+
+    public void setTrimAttributeValues(boolean trimAttributeValues) {
+        this.trimAttributeValues = trimAttributeValues;
+    }
+    
+    /**
+     * Resets prune tags set and adds tag name conditions to it.
+     * All the tags listed by pruneTags param are added.
+     *
+     * @param pruneTags
+     */
+    public void setPruneTags(String pruneTags) {
+        this.pruneTags = pruneTags;
+        this.resetPruneTagSet();
+        this.addTagNameConditions(this.pruneTagSet, pruneTags);
+    }
+
+    /**
+     * Adds the condition to existing prune tag set.
+     *
+     * @param condition
+     */
+    public void addPruneTagNodeCondition(ITagNodeCondition condition){
+        pruneTagSet.add(condition);
+    }
+
+    public Set<ITagNodeCondition> getPruneTagSet() {
+        return pruneTagSet;
+    }
+
+    public String getAllowTags() {
+        return allowTags;
+    }
+
+    public void setAllowTags(String allowTags) {
+        this.allowTags = allowTags;
+        this.setAllowTagSet(allowTags);
+    }
+
+    private void setAllowTagSet(String allowTags) {
+        allowTagSet.clear();
+        addTagNameConditions(allowTagSet, allowTags);
+    }
+
+
+    public boolean isTransSpecialEntitiesToNCR() {
+        return transSpecialEntitiesToNCR;
+    }
+
+    public void setTransSpecialEntitiesToNCR(boolean transSpecialEntitiesToNCR) {
+        this.transSpecialEntitiesToNCR = transSpecialEntitiesToNCR;
+    }
+
+    /**
+     * Parses a comma-separated list of tag names and adds one
+     * {@link TagNodeNameCondition} per name to the given set. Names are trimmed
+     * and lower-cased independently of the default locale; blank entries are skipped.
+     *
+     * @param tagSet set that receives the new conditions
+     * @param tagsNameStr comma-separated tag names; {@code null} adds nothing
+     */
+    private void addTagNameConditions(Set<ITagNodeCondition> tagSet, String tagsNameStr) {
+        if (tagsNameStr == null) {
+            return;
+        }
+        StringTokenizer tokenizer = new StringTokenizer(tagsNameStr, ",");
+        while (tokenizer.hasMoreTokens()) {
+            // Locale.ROOT: tag names must not change with the default locale
+            // (a Turkish locale would turn "IMG" into "ımg").
+            String tagName = tokenizer.nextToken().trim().toLowerCase(java.util.Locale.ROOT);
+            // An entry such as " " in "a, ,b" would otherwise yield a condition for an empty name.
+            if (tagName.length() > 0) {
+                tagSet.add(new TagNodeNameCondition(tagName));
+            }
+        }
+    }
+
+    public Set<ITagNodeCondition> getAllowTagSet() {
+        return allowTagSet;
+    }
+
+    /**
+     * @param charset the charset to set
+     */
+    public void setCharset(String charset) {
+        this.charset = charset;
+    }
+
+    /**
+     * @return the charset
+     */
+    public String getCharset() {
+        return charset;
+    }
+
+    public String getBooleanAttributeValues() {
+        return booleanAttributeValues;
+    }
+
+    public void setBooleanAttributeValues(String booleanAttributeValues) {
+        if ( BOOL_ATT_SELF.equalsIgnoreCase(booleanAttributeValues) ||
+             BOOL_ATT_EMPTY.equalsIgnoreCase(booleanAttributeValues) ||
+             BOOL_ATT_TRUE.equalsIgnoreCase(booleanAttributeValues) ) {
+            this.booleanAttributeValues = booleanAttributeValues.toLowerCase();
+        } else {
+            this.booleanAttributeValues = BOOL_ATT_SELF;
+        }
+    }
+
+    /**
+     * advancedXmlEscape = true;
+     * setUseCdataFor("script,style");
+     * translateSpecialEntities = true;
+     * recognizeUnicodeChars = true;
+     * omitUnknownTags = false;
+     * treatUnknownTagsAsContent = false;
+     * omitDeprecatedTags = false;
+     * treatDeprecatedTagsAsContent = false;
+     * omitComments = false;
+     * omitXmlDeclaration = OptionalOutput.alwaysOutput;
+     * omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
+     * omitHtmlEnvelope = OptionalOutput.alwaysOutput;
+     * useEmptyElementTags = true;
+     * allowMultiWordAttributes = true;
+     * allowHtmlInsideAttributes = false;
+     * ignoreQuestAndExclam = true;
+     * namespacesAware = true;
+     * keepHeadWhitespace = true;
+     * addNewlineToHeadAndBody = true;
+     * hyphenReplacementInComment = "=";
+     * pruneTags = null;
+     * allowTags = null;
+     * booleanAttributeValues = BOOL_ATT_SELF;
+     * collapseNullHtml = CollapseHtml.none
+     * charset = "UTF-8";
+     * trimAttributeValues = true;
+     * tagInfoProvider = HTML5TagProvider.INSTANCE
+     * maxDepth = 1000
+     */
+    public void reset() {
+        advancedXmlEscape = true;
+        setUseCdataFor("script,style");
+        translateSpecialEntities = true;
+        recognizeUnicodeChars = true;
+        omitUnknownTags = false;
+        treatUnknownTagsAsContent = false;
+        omitDeprecatedTags = false;
+        treatDeprecatedTagsAsContent = false;
+        omitComments = false;
+        omitXmlDeclaration = OptionalOutput.alwaysOutput;
+        omitDoctypeDeclaration = OptionalOutput.alwaysOutput;
+        omitHtmlEnvelope = OptionalOutput.alwaysOutput;
+        useEmptyElementTags = true;
+        allowMultiWordAttributes = true;
+        allowHtmlInsideAttributes = false;
+        ignoreQuestAndExclam = true;
+        namespacesAware = true;
+        addNewlineToHeadAndBody = true;
+        keepWhitespaceAndCommentsInHead = true;
+        hyphenReplacementInComment = "=";
+        setPruneTags(null);
+        setAllowTags(null);
+        booleanAttributeValues = BOOL_ATT_SELF;
+        charset = "UTF-8";
+        cleanerTransformations.clear();
+        resetPruneTagSet();
+        if (this.getHtmlVersion()==HtmlCleaner.HTML_4){
+        	tagInfoProvider = Html4TagProvider.INSTANCE;
+        }
+        else{
+        	tagInfoProvider = Html5TagProvider.INSTANCE;
+        }
+        htmlModificationListeners = new ArrayList < HtmlModificationListener >();
+        omitCdataOutsideScriptAndStyle = false;
+        trimAttributeValues = true;
+        invalidAttributeNamePrefix = "";
+        allowInvalidAttributeNames = false;
+        maxDepth = 1000;
+    }
+
+    private void resetPruneTagSet() {
+        pruneTagSet.clear();
+        pruneTagSet.add(TagNodeAutoGeneratedCondition.INSTANCE);
+    }
+
+    /**
+     * @return the cleanerTransformations
+     */
+    public CleanerTransformations getCleanerTransformations() {
+        return cleanerTransformations;
+    }
+
+    /**
+     * Replaces the transformations object. Passing {@code null} does not null the
+     * field; instead the currently held transformations object is cleared.
+     *
+     * @param cleanerTransformations the new transformations, or {@code null} to clear
+     */
+    public void setCleanerTransformations(CleanerTransformations cleanerTransformations) {
+        if ( cleanerTransformations != null ) {
+            this.cleanerTransformations = cleanerTransformations;
+            return;
+        }
+        this.cleanerTransformations.clear();
+    }
+
+    /**
+     * Adds a listener to the list of objects that will be notified about changes that
+     * cleaner does during cleanup process.
+     *
+     * @param listener -- listener object to be notified of the changes.
+     */
+    public void addHtmlModificationListener(HtmlModificationListener listener){
+        htmlModificationListeners.add(listener);
+    }
+
+    public void fireConditionModification(ITagNodeCondition condition, TagNode tagNode) {
+        for (HtmlModificationListener listener : htmlModificationListeners) {
+            listener.fireConditionModification(condition, tagNode);
+        }
+    }
+
+    public void fireHtmlError(boolean certainty, TagNode startTagToken, ErrorType type) {
+        for (HtmlModificationListener listener : htmlModificationListeners) {
+            listener.fireHtmlError(certainty, startTagToken, type);
+        }
+
+    }
+
+    public void fireUglyHtml(boolean certainty, TagNode startTagToken, ErrorType errorType) {
+        for (HtmlModificationListener listener : htmlModificationListeners) {
+            listener.fireUglyHtml(certainty, startTagToken, errorType);
+        }
+    }
+
+    public void fireUserDefinedModification(boolean certainty, TagNode tagNode, ErrorType errorType) {
+        for (HtmlModificationListener listener : htmlModificationListeners) {
+            listener.fireUserDefinedModification(certainty, tagNode, errorType);
+        }
+    }
+
+    /**
+     * Get the prefix to use to try to make valid attribute names
+     * @return invalidAttributeNamePrefix
+     */
+	public String getInvalidXmlAttributeNamePrefix() {
+		return invalidAttributeNamePrefix;
+	}
+
+	/**
+	 * Sets the prefix to use for xml attributes that are invalid
+	 * @param invalidXmlAttributePrefix the prefix to use
+	 */
+	public void setInvalidXmlAttributeNamePrefix(
+			String invalidXmlAttributePrefix) {
+		this.invalidAttributeNamePrefix = invalidXmlAttributePrefix;
+	}
+
+	/**
+	 * Set whether to allow invalid attribute names, or to try to fix or omit them
+	 * @param allowInvalidAttributeNames True if invalid attributes allowed
+	 */
+	public void setAllowInvalidAttributeNames(
+			boolean allowInvalidAttributeNames) {
+		this.allowInvalidAttributeNames = allowInvalidAttributeNames;
+	}
+
+	/**
+	 * If false, when outputting XML, if an attribute name is not valid, attempt to
+	 * fix it by using a prefix and removing invalid characters. Otherwise, omit invalid attributes
+	 * @return True if invalid attribute names are allowed.
+	 */
+	public boolean isAllowInvalidAttributeNames() {
+		return allowInvalidAttributeNames;
+	}
+}
@@ -0,0 +1,149 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * Contains transformation collection: per-tag {@link TagTransformation}s stored
+ * under their source tag (lookups lower-case the requested name), plus one global
+ * {@link TagTransformation} that is applied to the attributes of every tag.
+ */
+public class CleanerTransformations {
+
+    private Map<String, TagTransformation> mappings = new HashMap<String, TagTransformation>();
+    private TagTransformation globalTransformations = new TagTransformation();
+
+    /**
+     * Creates an empty collection.
+     */
+    public CleanerTransformations() {
+
+    }
+
+    /**
+     * Creates a collection from textual transformation descriptions.
+     *
+     * @param transInfos map of String keys to String values, processed by
+     *        {@link #updateTagTransformations(Map)}
+     */
+    public CleanerTransformations(Map<?, ?> transInfos) {
+        updateTagTransformations(transInfos);
+    }
+
+    /**
+     * Adds specified tag transformation to the collection, replacing any
+     * transformation already stored for the same source tag.
+     *
+     * @param tagTransformation transformation to add; {@code null} is ignored
+     */
+    public void addTransformation(TagTransformation tagTransformation) {
+        if (tagTransformation != null) {
+            mappings.put( tagTransformation.getSourceTag(), tagTransformation );
+        }
+    }
+
+    /**
+     * Adds an attribute transformation to the global transformation.
+     *
+     * @param attributeTransformation transformation to add
+     */
+    public void addGlobalTransformation(AttributeTransformation attributeTransformation) {
+        globalTransformations.addAttributePatternTransformation(attributeTransformation);
+    }
+
+    /**
+     * @param tagName tag name, matched after lower-casing
+     * @return true if a transformation is stored for the tag; false for {@code null}
+     */
+    public boolean hasTransformationForTag(String tagName)  {
+        return tagName != null && mappings.containsKey(tagName.toLowerCase());
+    }
+
+    /**
+     * @param tagName tag name, matched after lower-casing
+     * @return the transformation stored for the tag, or {@code null} if there is
+     *         none or {@code tagName} is {@code null}
+     */
+    public TagTransformation getTransformation(String tagName) {
+        return tagName != null ? mappings.get(tagName.toLowerCase()) : null;
+    }
+
+    /**
+     * Applies one textual transformation description.
+     * <ul>
+     * <li>A key without a dot (or starting with one) defines a tag transformation:
+     *     the value has the form {@code destname[,preserveatts]}, where source
+     *     attributes are preserved unless a second token is given that is not
+     *     "true", "yes" or "1".</li>
+     * <li>A key of the form {@code tagname.attribute} adds an attribute
+     *     transformation to the transformation already registered for
+     *     {@code tagname}; it is ignored if no such transformation exists or if
+     *     the attribute part is missing.</li>
+     * </ul>
+     *
+     * @param key tag name or {@code tagname.attribute}; must not be {@code null}
+     * @param value description as outlined above; may be {@code null}
+     */
+    public void updateTagTransformations(String key, String value) {
+        int index = key.indexOf('.');
+
+        // new tag transformation case (tagname[=destname[,preserveatts]])
+        if (index <= 0) {
+            String destTag = null;
+            boolean preserveSourceAtts = true;
+            if (value != null) {
+                String[] tokens = Utils.tokenize(value, ",;");
+                if (tokens.length > 0) {
+                    destTag = tokens[0];
+                }
+                if (tokens.length > 1) {
+                    preserveSourceAtts = "true".equalsIgnoreCase(tokens[1]) ||
+                                         "yes".equalsIgnoreCase(tokens[1]) ||
+                                         "1".equals(tokens[1]);
+                }
+            }
+            addTransformation(new TagTransformation(key, destTag, preserveSourceAtts));
+            return;
+        }
+
+        // attribute transformation description
+        String[] parts = Utils.tokenize(key, ".");
+        // A key such as "tag." may tokenize to fewer than two parts; without an
+        // attribute name there is nothing to register, so skip instead of
+        // failing on parts[1].
+        if (parts.length < 2) {
+            return;
+        }
+        TagTransformation trans = getTransformation(parts[0]);
+        if (trans != null) {
+            trans.addAttributeTransformation(parts[1], value);
+        }
+    }
+
+    /**
+     * Applies every entry of the map via {@link #updateTagTransformations(String, String)},
+     * in the map's iteration order.
+     *
+     * @param transInfos map whose keys and values are Strings
+     */
+    public void updateTagTransformations(Map<?, ?> transInfos) {
+        for (Map.Entry<?, ?> entry : transInfos.entrySet()) {
+            updateTagTransformations((String) entry.getKey(), (String) entry.getValue());
+        }
+    }
+
+    /**
+     * Transforms the attributes of a tag: first with the tag's own transformation
+     * (if one is registered), then with the global transformation.
+     *
+     * @param originalTagName name of the tag the attributes belong to
+     * @param attributes attributes to transform
+     * @return the result of the global transformation
+     */
+    public Map<String, String> transformAttributes(String originalTagName, Map<String, String> attributes) {
+        TagTransformation tagTrans = getTransformation(originalTagName);
+        Map<String, String> results = tagTrans != null
+                ? tagTrans.applyTagTransformations(attributes)
+                : attributes;
+        return this.globalTransformations.applyTagTransformations(results);
+    }
+
+    /**
+     * @param tagName original tag name
+     * @return the destination tag of the transformation registered for the tag,
+     *         or {@code tagName} itself if none is registered
+     */
+    public String getTagName(String tagName) {
+        // getTransformation already returns null for a null or unknown name.
+        TagTransformation tagTransformation = getTransformation(tagName);
+        return tagTransformation != null ? tagTransformation.getDestTag() : tagName;
+    }
+
+    /**
+     * Removes all per-tag transformations. The global transformation is not
+     * modified by this method.
+     */
+    public void clear() {
+        this.mappings.clear();
+    }
+}
@@ -0,0 +1,83 @@
+/*
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+ 
+    Additional work by Amplafi. -- All rights released.
+ */
+package org.htmlcleaner;
+
+/**
+ * Describes which closing forms a tag may use, as a pair of flags:
+ * whether the minimized form {@code <x/>} is permitted and whether an
+ * end tag is permitted.
+ *
+ * @author patmoore
+ */
+public enum CloseTag {
+    /**
+     * {@code <div></div>} is required. Minimizing to {@code <div/>} is not permitted.
+     */
+    required(false, true),
+    /**
+     * {@code <hr>} or {@code <hr/>} is permitted.
+     */
+    optional(true, true),
+    /**
+     * The minimized form is permitted but an end tag is not.
+     * NOTE(review): presumably this means {@code </img>} is not permitted while
+     * {@code <img/>} is -- confirm against the callers.
+     */
+    forbidden(true, false);
+
+    private final boolean allowsMinimizedTag;
+    private final boolean allowsEndTag;
+
+    /**
+     * @param minimizedTagPermitted if true tag can be reduced to {@code <x/>}
+     * @param endTagPermitted value reported by {@link #isEndTagPermitted()}
+     */
+    CloseTag(boolean minimizedTagPermitted, boolean endTagPermitted) {
+        allowsMinimizedTag = minimizedTagPermitted;
+        allowsEndTag = endTagPermitted;
+    }
+
+    /**
+     * @return true if the {@code <x/>} form is allowed
+     */
+    public boolean isMinimizedTagPermitted() {
+        return allowsMinimizedTag;
+    }
+
+    /**
+     * @return the end-tag flag: true for {@link #required} and {@link #optional},
+     *         false for {@link #forbidden}
+     */
+    public boolean isEndTagPermitted() {
+        return allowsEndTag;
+    }
+}
@@ -0,0 +1,384 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+	All rights reserved.
+
+	Redistribution and use of this software in source and binary forms,
+	with or without modification, are permitted provided that the following
+	conditions are met:
+
+	* Redistributions of source code must retain the above
+	  copyright notice, this list of conditions and the
+	  following disclaimer.
+
+	* Redistributions in binary form must reproduce the above
+	  copyright notice, this list of conditions and the
+	  following disclaimer in the documentation and/or other
+	  materials provided with the distribution.
+
+	* The name of HtmlCleaner may not be used to endorse or promote
+	  products derived from this software without specific prior
+	  written permission.
+
+	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+	POSSIBILITY OF SUCH DAMAGE.
+
+	You can contact Vladimir Nikic by sending e-mail to
+	nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+	subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.FileOutputStream;
+import java.net.URL;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import org.htmlcleaner.audit.HtmlModificationListenerLogger;
+
+/**
+ * <p>Command line usage class.</p>
+ *
+ * <p>Options are passed either as <code>name=value</code> arguments or as bare
+ * switches (for example <code>quiet</code>). Option names are matched
+ * case-insensitively and by prefix: an argument is accepted for an option when
+ * the argument's name starts with the option's name.</p>
+ */
+public class CommandLine {
+
+    private static final String OMITXMLDECL = "omitxmldecl";
+
+    private final static String className = CommandLine.class.getName();
+    private final static Logger logger = Logger.getLogger(className);
+
+    /**
+     * Case-insensitive, locale-independent prefix test.
+     * @param text the text to inspect
+     * @param prefix the prefix looked for
+     * @return true if <code>text</code> starts with <code>prefix</code>, ignoring case
+     */
+    private static boolean startsWithIgnoreCase(String text, String prefix) {
+        return text.regionMatches(true, 0, prefix, 0, prefix.length());
+    }
+
+    /**
+     * If the specified argument name exists without a value, return true.
+     * If it exists with a value, translate it as a boolean.
+     * Arguments that do not match the name are ignored; when the switch is
+     * given several times the last occurrence wins.
+     * @param args the command line arguments
+     * @param name the switch name
+     * @return true, or false, depending on whether the switch has been specified
+     */
+    private static boolean getSwitchArgument(String[] args, String name) {
+        boolean value = false;
+        for (String curr : args) {
+            int eqIndex = curr.indexOf('=');
+            if (eqIndex >= 0) {
+                String argName = curr.substring(0, eqIndex).trim();
+                String argValue = curr.substring(eqIndex + 1).trim();
+                if (startsWithIgnoreCase(argName, name)) {
+                    value = toBoolean(argValue);
+                }
+            } else if (startsWithIgnoreCase(curr.trim(), name)) {
+                // only a bare argument carrying this switch's name turns it on
+                value = true;
+            }
+        }
+        return value;
+    }
+
+    /**
+     * Looks up the value of the first <code>name=value</code> argument whose
+     * name starts with the given option name.
+     * @param args the command line arguments
+     * @param name the option name
+     * @param defaultValue value returned when no argument matches
+     * @return the trimmed value of the first matching argument, or the default
+     */
+    private static String getArgValue(String[] args, String name, String defaultValue) {
+        for (String curr : args) {
+            int eqIndex = curr.indexOf('=');
+            if (eqIndex >= 0) {
+                String argName = curr.substring(0, eqIndex).trim();
+                String argValue = curr.substring(eqIndex + 1).trim();
+
+                if (startsWithIgnoreCase(argName, name)) {
+                    return argValue;
+                }
+            }
+        }
+
+        return defaultValue;
+    }
+
+    /**
+     * @param s textual flag
+     * @return true for "on", "true" or "yes" (any case), false for anything else including null
+     */
+    private static boolean toBoolean(String s) {
+        return s != null && ( "on".equalsIgnoreCase(s) || "true".equalsIgnoreCase(s) || "yes".equalsIgnoreCase(s) );
+    }
+
+    /**
+     * Prints the usage text to the standard error stream.
+     */
+    private static void printUsage() {
+        System.err.println("Usage: java -jar htmlcleanerXX.jar src=<url | file> [htmlver=4] [incharset=<charset>] " +
+                           "[dest=<file>] [outcharset=<charset>] [taginfofile=<file>] [options...]");
+        System.err.println("Alternative: java -jar htmlcleanerXX.jar (reads the input from console)");
+        System.err.println("");
+        System.err.println("where options include:");
+        System.err.println("    outputtype=simple* | compact | browser-compact | pretty");
+        System.err.println("    advancedxmlescape=true* | false");
+        System.err.println("    usecdata=true* | false");
+        System.err.println("    usecdatafor=<string value> [script,style]");
+        System.err.println("    specialentities=true* | false");
+        System.err.println("    unicodechars=true* | false");
+        System.err.println("    omitunknowntags=true | false*");
+        System.err.println("    treatunknowntagsascontent=true | false*");
+        System.err.println("    omitdeprtags=true | false*");
+        System.err.println("    treatdeprtagsascontent=true | false*");
+        System.err.println("    omitcomments=true | false*");
+        System.err.println("    " +OMITXMLDECL +"=true* | false");
+        System.err.println("    omitdoctypedecl=true* | false");
+        System.err.println("    omithtmlenvelope=true | false*");
+        System.err.println("    useemptyelementtags=true* | false");
+        System.err.println("    allowmultiwordattributes=true* | false");
+        System.err.println("    allowhtmlinsideattributes=true | false*");
+        System.err.println("    ignoreqe=true | false*");
+        System.err.println("    namespacesaware=true* | false");
+        System.err.println("    hyphenreplacement=<string value> [=]");
+        System.err.println("    prunetags=<string value> []");
+        System.err.println("    booleanatts=self* | empty | true");
+        System.err.println("    nodebyxpath=<xpath expression>");
+        System.err.println("    allowinvalidxmlattributenames=true | false*");
+        System.err.println("    invalidxmlattributenameprefix=<string value> []");
+        System.err.println("    t:<sourcetagX>[=<desttag>[,<preserveatts>]]");
+        System.err.println("    t:<sourcetagX>.<destattrY>[=<template>]");
+    }
+
+    /**
+     * Reads the whole standard input, keeping the line breaks between lines.
+     * @return the text read, or an empty string if the input holds no token
+     */
+    private static String readStandardInput() {
+        StringBuilder text = new StringBuilder();
+        Scanner scan = new Scanner(System.in);
+        try {
+            while (scan.hasNext()) {
+                if (text.length() > 0) {
+                    // keep lines apart so that text on adjacent lines is not glued together
+                    text.append('\n');
+                }
+                text.append(scan.nextLine());
+            }
+        } finally {
+            scan.close();
+        }
+        return text.toString();
+    }
+
+    /**
+     * Copies every cleaner option present on the command line into the properties.
+     * Options that are absent (or have an empty value) leave the property untouched.
+     * @param props the properties to configure
+     * @param args the command line arguments
+     */
+    private static void configureProperties(CleanerProperties props, String[] args) {
+        String omitUnknownTags = getArgValue(args, "omitunknowntags", "");
+        if ( !"".equals(omitUnknownTags) ) {
+            props.setOmitUnknownTags( toBoolean(omitUnknownTags) );
+        }
+
+        String treatUnknownTagsAsContent = getArgValue(args, "treatunknowntagsascontent", "");
+        if ( !"".equals(treatUnknownTagsAsContent) ) {
+            props.setTreatUnknownTagsAsContent( toBoolean(treatUnknownTagsAsContent) );
+        }
+
+        String omitDeprecatedTags = getArgValue(args, "omitdeprtags", "");
+        if ( !"".equals(omitDeprecatedTags) ) {
+            props.setOmitDeprecatedTags( toBoolean(omitDeprecatedTags) );
+        }
+
+        String treatDeprecatedTagsAsContent = getArgValue(args, "treatdeprtagsascontent", "");
+        if ( !"".equals(treatDeprecatedTagsAsContent) ) {
+            props.setTreatDeprecatedTagsAsContent( toBoolean(treatDeprecatedTagsAsContent) );
+        }
+
+        String advancedXmlEscape = getArgValue(args, "advancedxmlescape", "");
+        if ( !"".equals(advancedXmlEscape) ) {
+            props.setAdvancedXmlEscape( toBoolean(advancedXmlEscape) );
+        }
+
+        // Because names are matched by prefix, a "usecdatafor=..." argument also answers
+        // the "usecdata" lookup; "usecdata" is therefore honoured only when "usecdatafor" is absent.
+        String useCData = getArgValue(args, "usecdata", "");
+        String useCDataFor = getArgValue(args, "usecdatafor", "");
+        if ( !"".equals(useCData) && "".equals(useCDataFor) ) {
+            props.setUseCdataForScriptAndStyle( toBoolean(useCData) );
+        }
+        if ( !"".equals(useCDataFor) ) {
+            props.setUseCdataFor( useCDataFor );
+        }
+
+        String translateSpecialEntities = getArgValue(args, "specialentities", "");
+        if ( !"".equals(translateSpecialEntities) ) {
+            props.setTranslateSpecialEntities( toBoolean(translateSpecialEntities) );
+        }
+
+        String unicodeChars = getArgValue(args, "unicodechars", "");
+        if ( !"".equals(unicodeChars) ) {
+            props.setRecognizeUnicodeChars( toBoolean(unicodeChars) );
+        }
+
+        String omitComments = getArgValue(args, "omitcomments", "");
+        if ( !"".equals(omitComments) ) {
+            props.setOmitComments( toBoolean(omitComments) );
+        }
+
+        String omitXmlDeclaration = getArgValue(args, OMITXMLDECL, "");
+        if ( !"".equals(omitXmlDeclaration) ) {
+            props.setOmitXmlDeclaration( toBoolean(omitXmlDeclaration) );
+        }
+
+        String omitDoctypeDeclaration = getArgValue(args, "omitdoctypedecl", "");
+        if ( !"".equals(omitDoctypeDeclaration) ) {
+            props.setOmitDoctypeDeclaration( toBoolean(omitDoctypeDeclaration) );
+        }
+
+        String omitHtmlEnvelope = getArgValue(args, "omithtmlenvelope", "");
+        if ( !"".equals(omitHtmlEnvelope) ) {
+            props.setOmitHtmlEnvelope( toBoolean(omitHtmlEnvelope) );
+        }
+
+        String useEmptyElementTags = getArgValue(args, "useemptyelementtags", "");
+        if ( !"".equals(useEmptyElementTags) ) {
+            props.setUseEmptyElementTags( toBoolean(useEmptyElementTags) );
+        }
+
+        String allowMultiWordAttributes = getArgValue(args, "allowmultiwordattributes", "");
+        if ( !"".equals(allowMultiWordAttributes) ) {
+            props.setAllowMultiWordAttributes( toBoolean(allowMultiWordAttributes) );
+        }
+
+        String allowHtmlInsideAttributes = getArgValue(args, "allowhtmlinsideattributes", "");
+        if ( !"".equals(allowHtmlInsideAttributes) ) {
+            props.setAllowHtmlInsideAttributes( toBoolean(allowHtmlInsideAttributes) );
+        }
+
+        String ignoreQuestAndExclam = getArgValue(args, "ignoreqe", "");
+        if ( !"".equals(ignoreQuestAndExclam) ) {
+            props.setIgnoreQuestAndExclam( toBoolean(ignoreQuestAndExclam) );
+        }
+
+        String namespacesAware = getArgValue(args, "namespacesaware", "");
+        if ( !"".equals(namespacesAware) ) {
+            props.setNamespacesAware( toBoolean(namespacesAware) );
+        }
+
+        String commentHyphen = getArgValue(args, "hyphenreplacement", "");
+        if ( !"".equals(commentHyphen) ) {
+            props.setHyphenReplacementInComment(commentHyphen);
+        }
+
+        String pruneTags = getArgValue(args, "prunetags", "");
+        if ( !"".equals(pruneTags) ) {
+            props.setPruneTags(pruneTags);
+        }
+
+        String booleanAtts = getArgValue(args, "booleanatts", "");
+        if ( !"".equals(booleanAtts) ) {
+            props.setBooleanAttributeValues(booleanAtts);
+        }
+
+        // The usage text documents "allowinvalidxmlattributenames"; the shorter spelling
+        // that used to be the only one parsed is still accepted as a fallback.
+        String allowInvalidAttributeNames = getArgValue(args, "allowinvalidxmlattributenames",
+                getArgValue(args, "allowinvalidattributenames", ""));
+        if ( !"".equals(allowInvalidAttributeNames) ) {
+            props.setAllowInvalidAttributeNames( toBoolean(allowInvalidAttributeNames) );
+        }
+
+        String invalidXmlAttributeNamePrefix = getArgValue(args, "invalidxmlattributenameprefix", "");
+        if ( !"".equals(invalidXmlAttributeNamePrefix) ) {
+            props.setInvalidXmlAttributeNamePrefix( invalidXmlAttributeNamePrefix );
+        }
+    }
+
+    /**
+     * Collects the <code>t:...</code> transformation arguments.
+     * @param args the command line arguments
+     * @return map from the text before the first '=' (prefix "t:" removed) to the text
+     *         after it, or to null when the argument carries no value
+     */
+    private static Map<String, String> collectTransformations(String[] args) {
+        Map<String, String> transInfos = new TreeMap<String, String>();
+        for (String arg : args) {
+            if (arg.startsWith("t:") && arg.length() > 2) {
+                String spec = arg.substring(2);
+                int index = spec.indexOf('=');
+                String key = index <= 0 ? spec : spec.substring(0, index);
+                String value = index <= 0 ? null : spec.substring(index + 1);
+                transInfos.put(key, value);
+            }
+        }
+        return transInfos;
+    }
+
+    /**
+     * Writes the node with the serializer selected by the <code>outputtype</code> option;
+     * unrecognized or missing types fall back to the simple XML serializer.
+     */
+    private static void writeNode(TagNode node, CleanerProperties props, String outputType,
+                                  OutputStream out, String outCharset) throws IOException {
+        if ( "compact".equals(outputType) ) {
+            new CompactXmlSerializer(props).writeToStream(node, out, outCharset);
+        } else if ( "browser-compact".equals(outputType) ) {
+            new BrowserCompactXmlSerializer(props).writeToStream(node, out, outCharset);
+        } else if ( "pretty".equals(outputType) ) {
+            new PrettyXmlSerializer(props).writeToStream(node, out, outCharset);
+        } else if ( "htmlsimple".equals(outputType) ) {
+            new SimpleHtmlSerializer(props).writeToStream(node, out, outCharset);
+        } else if ( "htmlpretty".equals(outputType) ) {
+            new PrettyHtmlSerializer(props).writeToStream(node, out, outCharset);
+        } else if ( "htmlcompact".equals(outputType) ) {
+            new CompactHtmlSerializer(props).writeToStream(node, out, outCharset);
+        } else {
+            new SimpleXmlSerializer(props).writeToStream(node, out, outCharset);
+        }
+    }
+
+    /**
+     * Cleans the HTML named by <code>src</code> (a URL or a file), or the text read from
+     * the console when <code>src</code> is absent, and writes the result to
+     * <code>dest</code> or to the standard output.
+     * Prints the usage text and exits with status 1 when there is no input at all.
+     * @param args the command line arguments
+     * @throws IOException if reading the input or writing the output fails
+     * @throws XPatherException if the <code>nodebyxpath</code> expression cannot be evaluated
+     */
+    public static void main(String[] args) throws IOException, XPatherException {
+        String source = getArgValue(args, "src", "");
+        String s = "";
+
+        if ( "".equals(source) ) {
+            s = readStandardInput();
+            if ("".equals(s)) {
+                printUsage();
+                System.exit(1);
+            }
+            System.err.println("Output:");
+        }
+
+        String inCharset = getArgValue(args, "incharset", "");
+        if ("".equals(inCharset)) {
+            inCharset = CleanerProperties.DEFAULT_CHARSET;
+        }
+
+        String outCharset = getArgValue(args, "outcharset", "");
+        if ("".equals(outCharset)) {
+            outCharset = CleanerProperties.DEFAULT_CHARSET;
+        }
+
+        String htmlversion = getArgValue(args, "htmlver", "");
+        String destination = getArgValue(args, "dest", "");
+        String outputType = getArgValue(args, "outputtype", "");
+        String nodeByXPath = getArgValue(args, "nodebyxpath", "");
+        boolean quiet = getSwitchArgument(args, "quiet");
+
+        HtmlCleaner cleaner;
+
+        String tagInfoFile = getArgValue(args, "taginfofile", "");
+        if ( !"".equals(tagInfoFile) ) {
+            cleaner = new HtmlCleaner(new ConfigFileTagProvider(new File(tagInfoFile)));
+        } else if ("4".equals(htmlversion)) { //Set appropriate TagProvider
+            cleaner = new HtmlCleaner(Html4TagProvider.INSTANCE);
+        } else {
+            cleaner = new HtmlCleaner(Html5TagProvider.INSTANCE);
+        }
+
+        final CleanerProperties props = cleaner.getProperties();
+
+        //
+        // If the user specifies "quiet" or "quiet=true" then we don't add a modification
+        // listener
+        //
+        if (!quiet) {
+            props.addHtmlModificationListener(new HtmlModificationListenerLogger(logger));
+        }
+
+        configureProperties(props, args);
+
+        // collect transformation info
+        cleaner.initCleanerTransformations(collectTransformations(args));
+
+        long start = System.currentTimeMillis();
+
+        TagNode node;
+
+        // Only the scheme is compared case-insensitively; the URL itself is passed on
+        // unchanged because its path and query may be case-sensitive.
+        if (startsWithIgnoreCase(source, "http://") || startsWithIgnoreCase(source, "https://")) {
+            node = cleaner.clean(new URL(source), inCharset);
+        } else if (!source.isEmpty()) {
+            node = cleaner.clean(new File(source), inCharset);
+        } else {
+            node = cleaner.clean(s);
+        }
+
+        // if user specifies XPath expression to choose node for serialization, then
+        // try to evaluate XPath and look for first TagNode instance in the resulting array
+        if ( !"".equals(nodeByXPath) ) {
+            final Object[] xpathResult = node.evaluateXPath(nodeByXPath);
+            TagNode found = null;
+            for (Object candidate : xpathResult) {
+                if ( candidate instanceof TagNode ) {
+                    found = (TagNode) candidate;
+                    break;
+                }
+            }
+            if (found != null) {
+                node = found;
+                System.out.println("Node successfully found by XPath.");
+            } else {
+                System.out.println("Node not found by XPath expression - whole html tree is going to be serialized!");
+            }
+        }
+
+        boolean toFile = destination != null && !"".equals(destination.trim());
+        OutputStream out = toFile ? new FileOutputStream(destination) : System.out;
+        try {
+            writeNode(node, props, outputType, out, outCharset);
+        } finally {
+            if (toFile) {
+                // release the file handle; System.out must stay open for the message below
+                out.close();
+            } else {
+                out.flush();
+            }
+        }
+
+        if (!quiet) {
+            System.out.println("Finished successfully in " + (System.currentTimeMillis() - start)+ "ms." );
+        }
+    }
+
+}
@@ -0,0 +1,71 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+
+/**
+ * <p>HTML comment token.</p>
+ *
+ * <p>Holds the text of a comment without its delimiters and can render it
+ * back as <code>&lt;!--text--&gt;</code>.</p>
+ */
+public class CommentNode extends BaseHtmlNode implements HtmlNode {
+
+    private static final String OPENING = "<!--";
+    private static final String CLOSING = "-->";
+
+    private String content;
+
+    /**
+     * @param content the comment text, without the surrounding delimiters
+     */
+    public CommentNode(String content) {
+        this.content = content;
+    }
+
+    /**
+     * @return the comment text as given to the constructor
+     */
+    public String getContent() {
+        return content;
+    }
+
+    /**
+     * @return the comment text wrapped in the comment delimiters
+     */
+    public String getCommentedContent() {
+        StringBuilder wrapped = new StringBuilder(OPENING);
+        wrapped.append(content);
+        wrapped.append(CLOSING);
+        return wrapped.toString();
+    }
+
+    /**
+     * Writes the delimited comment to the writer; the serializer is not consulted.
+     */
+    public void serialize(Serializer serializer, Writer writer) throws IOException {
+        final String commented = getCommentedContent();
+        writer.write(commented);
+    }
+
+    /**
+     * @return the same text as {@link #getCommentedContent()}
+     */
+    @Override
+    public String toString() {
+        return getCommentedContent();
+    }
+
+}
@@ -0,0 +1,111 @@
+/*  Copyright (c) 2006-20013, HtmlCleaner project
+    All rights reserved.
+	
+    Redistribution and use of this software in source and binary forms, 
+    with or without modification, are permitted provided that the following 
+    conditions are met:
+	
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+	
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+	
+    * The name of HtmlCleaner may not be used to endorse or promote 
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+    POSSIBILITY OF SUCH DAMAGE.
+	
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * <p>Compact HTML serializer - creates resulting HTML by stripping whitespaces wherever possible.</p>
+ *
+ * <p>Text found while at least one <code>pre</code> element is open is written unchanged.</p>
+ */
+public class CompactHtmlSerializer extends HtmlSerializer {
+
+    /** Number of <code>pre</code> elements currently being serialized (they can nest). */
+    private int openPreTags = 0;
+
+    public CompactHtmlSerializer(CleanerProperties props) {
+        super(props);
+    }
+
+    /**
+     * Writes the tag, its children and (unless the tag is written in minimized form) its end tag.
+     * @param tagNode the node to serialize
+     * @param writer destination of the markup
+     * @throws IOException if the writer fails
+     */
+    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
+        boolean isPreTag = "pre".equalsIgnoreCase(tagNode.getName());
+        if (isPreTag) {
+            openPreTags++;
+        }
+
+        try {
+            serializeOpenTag(tagNode, writer, false);
+
+            List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
+            if ( !isMinimizedTagSyntax(tagNode) ) {
+                for (Object item : tagChildren) {
+                    if (item instanceof ContentNode) {
+                        writeContent(tagNode, item.toString(), writer);
+                    } else if (item instanceof CommentNode) {
+                        String content = ((CommentNode) item).getCommentedContent().trim();
+                        writer.write(content);
+                    } else if (item instanceof BaseToken) {
+                        ((BaseToken) item).serialize(this, writer);
+                    }
+                }
+
+                serializeEndTag(tagNode, writer, false);
+            }
+        } finally {
+            // Always undo the increment above - also for a minimized tag or after a
+            // failed write - otherwise every later text node would be treated as
+            // preformatted.
+            if (isPreTag) {
+                openPreTags--;
+            }
+        }
+    }
+
+    /**
+     * Writes one text node: unchanged inside <code>pre</code>, otherwise trimmed, with
+     * a single space kept where the text started or ended with whitespace.
+     */
+    private void writeContent(TagNode tagNode, String content, Writer writer) throws IOException {
+        if (openPreTags > 0) {
+            writer.write(content);
+            return;
+        }
+
+        boolean startsWithSpace = content.length() > 0 && Character.isWhitespace( content.charAt(0) );
+        // length > 1: a single whitespace character is already covered by startsWithSpace
+        boolean endsWithSpace = content.length() > 1 && Character.isWhitespace( content.charAt(content.length() - 1) );
+        content = dontEscape(tagNode) ? content.trim() : escapeText(content.trim());
+
+        if (startsWithSpace) {
+            writer.write(' ');
+        }
+
+        if (content.length() != 0) {
+            writer.write(content);
+            if (endsWithSpace) {
+                writer.write(' ');
+            }
+        }
+
+        // A line break used to be written between text nodes here; removed due to issue #199.
+    }
+
+}
@@ -0,0 +1,98 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.*;
+
+/**
+ * <p>Compact XML serializer - creates resulting XML by stripping whitespaces.</p>
+ */
+public class CompactXmlSerializer extends XmlSerializer {
+
+    public CompactXmlSerializer(CleanerProperties props) {
+        super(props);
+    }
+
+    /**
+     * Writes the open tag and, unless the tag is written in minimized form, its
+     * children followed by the end tag. Text nodes are trimmed; a line break is
+     * written after a text node when the following child is not blank.
+     */
+    @Override
+    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
+        serializeOpenTag(tagNode, writer, false);
+
+        List<? extends BaseToken> children = tagNode.getAllChildren();
+        if (isMinimizedTagSyntax(tagNode)) {
+            return;
+        }
+
+        ListIterator<? extends BaseToken> cursor = children.listIterator();
+        while (cursor.hasNext()) {
+            Object child = cursor.next();
+            if (child == null) {
+                continue;
+            }
+
+            if (child instanceof ContentNode) {
+                String text = ((ContentNode) child).getContent().trim();
+                String output;
+                if (dontEscape(tagNode)) {
+                    output = text.replace("]]>", "]]&gt;");
+                } else {
+                    output = escapeXml(text);
+                }
+                writer.write(output);
+
+                if (cursor.hasNext()) {
+                    // look at the following child, then step back so the loop visits it too
+                    Object following = cursor.next();
+                    cursor.previous();
+                    if (!isWhitespaceString(following)) {
+                        writer.write("\n");
+                    }
+                }
+            } else if (child instanceof CommentNode) {
+                CommentNode comment = (CommentNode) child;
+                writer.write(comment.getCommentedContent().trim());
+            } else {
+                ((BaseToken) child).serialize(this, writer);
+            }
+        }
+
+        serializeEndTag(tagNode, writer, false);
+    }
+
+    /**
+     * Tells whether the object's string representation is blank (empty or whitespace only).
+     * @param object Object whose string representation is checked
+     * @return true if blank, false otherwise - also false for null or a null representation
+     */
+    private boolean isWhitespaceString(Object object) {
+        if (object == null) {
+            return false;
+        }
+        String text = object.toString();
+        if (text == null) {
+            return false;
+        }
+        return text.trim().length() == 0;
+    }
+}
@@ -0,0 +1,257 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import java.io.*;
+import java.util.HashMap;
+import java.util.Map;
+import java.net.URL;
+
+/**
+ * Configuration file tag provider - reads an XML file in the expected format and creates a tag provider.
+ * Used to create custom tag providers when used on the command line.
+ *
+ * <p>The provider is itself the map from tag name to {@link TagInfo}: every {@code <tag>} element
+ * of the configuration is stored in this map under the name reported by the created tag info.</p>
+ */
+public class ConfigFileTagProvider extends HashMap implements ITagInfoProvider {
+
+    // obtaining instance of the SAX parser factory
+    static SAXParserFactory parserFactory = SAXParserFactory.newInstance();
+    static {
+        parserFactory.setValidating(false);
+        parserFactory.setNamespaceAware(false);
+    }
+
+    // tells whether to generate code of the tag provider class based on XML configuration file
+    // to the standard output
+    private boolean generateCode = false;
+
+    /** Creates an empty provider; used only by {@link #main(String[])} for code generation. */
+    private ConfigFileTagProvider() {
+    }
+
+    /**
+     * Builds the provider from an already prepared SAX input source.
+     * The caller keeps ownership of any stream or reader held by the source.
+     *
+     * @param inputSource source of the XML tag configuration
+     * @throws HtmlCleanerException if the configuration cannot be read or parsed
+     */
+    public ConfigFileTagProvider(InputSource inputSource) {
+        try {
+            new ConfigParser(this).parse(inputSource);
+        } catch (Exception e) {
+            throw new HtmlCleanerException("Error parsing tag configuration file!", e);
+        }
+    }
+
+    /**
+     * Builds the provider from a configuration file. The file is handed to the XML parser as a
+     * byte stream, so the encoding declared in the document is honoured, and the stream is
+     * closed once parsing has finished or failed.
+     *
+     * @param file XML tag configuration file
+     * @throws HtmlCleanerException if the file cannot be opened, read or parsed
+     */
+    public ConfigFileTagProvider(File file) {
+        try {
+            parseAndClose(new FileInputStream(file));
+        } catch (Exception e) {
+            throw new HtmlCleanerException("Error parsing tag configuration file!", e);
+        }
+    }
+
+    /**
+     * Builds the provider from a configuration document located at the given URL. The document
+     * is read as a byte stream (encoding detection is left to the XML parser) and the stream is
+     * closed once parsing has finished or failed.
+     *
+     * @param url location of the XML tag configuration
+     * @throws HtmlCleanerException if the URL cannot be opened, read or parsed
+     */
+    public ConfigFileTagProvider(URL url) {
+        try {
+            // openStream() always yields the raw bytes; relying on getContent() returning an
+            // InputStream would silently leave the provider empty for other content handlers.
+            parseAndClose(url.openStream());
+        } catch (Exception e) {
+            throw new HtmlCleanerException("Error parsing tag configuration file!", e);
+        }
+    }
+
+    /**
+     * Looks up the tag info registered under the given name.
+     *
+     * @param tagName key to look up in this map
+     * @return the stored tag info, or {@code null} if none is stored under that key
+     */
+    public TagInfo getTagInfo(String tagName) {
+        return (TagInfo) get(tagName);
+    }
+
+    /**
+     * Parses the configuration from the given stream into this provider and always closes the stream.
+     */
+    private void parseAndClose(InputStream in) throws ParserConfigurationException, SAXException, IOException {
+        try {
+            new ConfigParser(this).parse(new InputSource(in));
+        } finally {
+            closeQuietly(in);
+        }
+    }
+
+    /**
+     * Closes the resource, ignoring a failure to close: by then parsing has already succeeded or
+     * thrown, and a close error on a read-only stream must not mask that outcome.
+     */
+    private static void closeQuietly(Closeable resource) {
+        try {
+            resource.close();
+        } catch (IOException ignored) {
+            // nothing useful can be done about a failed close of an input stream
+        }
+    }
+
+    /**
+     * Generates code for tag provider class from specified configuration XML file.
+     * In order to create custom tag info provider, make config file and call this main method
+     * with the specified file. Output will be generated on the standard output. This way a custom
+     * tag provider (class CustomTagProvider) is generated from an XML file. An example XML file,
+     * "example.xml", can be found in the source distribution.
+     *
+     * @param args optional; {@code args[0]} is the configuration file name, defaulting to {@code default.xml}
+     * @throws IOException if the configuration file cannot be opened or read
+     * @throws SAXException if the configuration is not well-formed, or a {@code <tag>} lacks the
+     *         attributes needed to generate its constructor call
+     * @throws ParserConfigurationException if no SAX parser can be created
+     */
+    public static void main(String[] args) throws IOException, SAXException, ParserConfigurationException {
+        final ConfigFileTagProvider provider = new ConfigFileTagProvider();
+        provider.generateCode = true;
+
+        String fileName = "default.xml";
+        if (args != null && args.length > 0) {
+            fileName = args[0];
+        }
+
+        File configFile = new File(fileName);
+        String packagePath = "org.htmlcleaner";
+        String className = "CustomTagProvider";
+
+        final ConfigParser parser = provider.new ConfigParser(provider);
+        System.out.println("package " + packagePath + ";");
+        System.out.println("import java.util.HashMap;");
+        // the generated field below uses these two types, so the generated source must import them
+        System.out.println("import java.util.concurrent.ConcurrentHashMap;");
+        System.out.println("import java.util.concurrent.ConcurrentMap;");
+        // NOTE(review): the generated class implements ITagInfoProvider but no getTagInfo method is
+        // emitted, and the generated TagInfo constructor call has six arguments while this class
+        // calls an eight-argument constructor - confirm the generated source compiles as intended.
+        System.out.println("public class " + className + " extends HashMap implements ITagInfoProvider {");
+        System.out.println("private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();");
+        System.out.println("// singleton instance, used if no other TagInfoProvider is specified");
+        System.out.println("public final static "+className+" INSTANCE= new "+className+"();");
+        System.out.println("public " + className + "() {");
+        System.out.println("TagInfo tagInfo;");
+        final InputStream in = new FileInputStream(configFile);
+        try {
+            parser.parse( new InputSource(in) );
+        } finally {
+            closeQuietly(in);
+        }
+        System.out.println("}");
+        System.out.println("}");
+    }
+
+
+    /**
+    * SAX parser for tag configuration files.
+    *
+    * <p>A {@code <tag>} element starts a new tag info; any other element except {@code <tags>}
+    * names a dependency whose text is applied to the current tag info. Text is buffered and
+    * applied once per element, because SAX may deliver one text node in several
+    * {@code characters} callbacks.</p>
+    */
+    private class ConfigParser extends DefaultHandler {
+        private TagInfo tagInfo = null;
+        private String dependencyName = null;
+        // text collected for the current dependency element; null while nothing has been received
+        private StringBuilder pendingText = null;
+        private Map tagInfoMap;
+
+        ConfigParser(Map tagInfoMap) {
+            this.tagInfoMap = tagInfoMap;
+        }
+
+        /**
+         * Parses the given source with a parser from the shared factory, using this object as handler.
+         */
+        public void parse(InputSource in) throws ParserConfigurationException, SAXException, IOException {
+            SAXParser parser = parserFactory.newSAXParser();
+            parser.parse(in, this);
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws SAXException {
+            // only text inside a dependency element of an open <tag> is of interest
+            if (tagInfo != null && dependencyName != null) {
+                if (pendingText == null) {
+                    pendingText = new StringBuilder();
+                }
+                pendingText.append(ch, start, length);
+            }
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+            // text seen so far belongs to the element that was current until now
+            flushPendingText();
+
+            if ( "tag".equals(qName) ) {
+                String name = attributes.getValue("name");
+                String content = attributes.getValue("content");
+                String section = attributes.getValue("section");
+                boolean deprecated = "true".equals(attributes.getValue("deprecated"));
+                boolean unique = "true".equals(attributes.getValue("unique"));
+                boolean ignorePermitted = "true".equals(attributes.getValue("ignore-permitted"));
+                ContentType contentType = ContentType.toValue(content);
+                BelongsTo belongsTo = BelongsTo.toValue(section);
+                tagInfo = new TagInfo(name, contentType,
+                                      belongsTo,
+                                      deprecated,
+                                      unique,
+                                      ignorePermitted, CloseTag.required, Display.any );
+                if (generateCode) {
+                    if (name == null || contentType == null || belongsTo == null) {
+                        throw new SAXException("Cannot generate code for <tag name=\"" + name
+                                + "\">: missing or unknown name, content (\"" + content
+                                + "\") or section (\"" + section + "\") attribute");
+                    }
+                    // literal replace: the values must not be interpreted as regex replacement syntax
+                    String s = "tagInfo = new TagInfo(\"#1\", #2, #3, #4, #5, #6);";
+                    s = s.replace("#1", name);
+                    s = s.replace("#2", ContentType.class.getCanonicalName()+"."+contentType.name());
+                    s = s.replace("#3", BelongsTo.class.getCanonicalName()+"."+belongsTo.name());
+                    s = s.replace("#4", Boolean.toString(deprecated));
+                    s = s.replace("#5", Boolean.toString(unique));
+                    s = s.replace("#6", Boolean.toString(ignorePermitted));
+                    System.out.println(s);
+                }
+            } else if ( !"tags".equals(qName) ) {
+                dependencyName = qName;
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) throws SAXException {
+            // apply the element's text before the tag info is stored or the dependency is cleared
+            flushPendingText();
+
+            if ( "tag".equals(qName) ) {
+                if (tagInfo != null) {
+                    tagInfoMap.put(tagInfo.getName(), tagInfo);
+                    if (generateCode) {
+                        System.out.println("this.put(\"" + tagInfo.getName() + "\", tagInfo);\n");
+                    }
+                }
+                tagInfo = null;
+            } else if ( !"tags".equals(qName) ) {
+                dependencyName = null;
+            }
+        }
+
+        /**
+         * Applies the buffered, trimmed text to the current tag info under the current dependency
+         * name and clears the buffer. Does nothing when no text has been received.
+         */
+        private void flushPendingText() {
+            if (pendingText == null) {
+                return;
+            }
+            String value = pendingText.toString().trim();
+            pendingText = null;
+            if (tagInfo != null) {
+                applyDependency(value);
+            }
+        }
+
+        /**
+         * Calls the TagInfo definer matching the current dependency name and, in code generation
+         * mode, prints the equivalent call. Unrecognized dependency names are ignored.
+         */
+        private void applyDependency(String value) {
+            final String definer;
+            if ( "fatal-tags".equals(dependencyName) ) {
+                tagInfo.defineFatalTags(value);
+                definer = "defineFatalTags";
+            } else if ( "req-enclosing-tags".equals(dependencyName) ) {
+                tagInfo.defineRequiredEnclosingTags(value);
+                definer = "defineRequiredEnclosingTags";
+            } else if ( "forbidden-tags".equals(dependencyName) ) {
+                tagInfo.defineForbiddenTags(value);
+                definer = "defineForbiddenTags";
+            } else if ( "allowed-children-tags".equals(dependencyName) ) {
+                tagInfo.defineAllowedChildrenTags(value);
+                definer = "defineAllowedChildrenTags";
+            } else if ( "higher-level-tags".equals(dependencyName) ) {
+                tagInfo.defineHigherLevelTags(value);
+                definer = "defineHigherLevelTags";
+            } else if ( "close-before-copy-inside-tags".equals(dependencyName) ) {
+                tagInfo.defineCloseBeforeCopyInsideTags(value);
+                definer = "defineCloseBeforeCopyInsideTags";
+            } else if ( "close-inside-copy-after-tags".equals(dependencyName) ) {
+                tagInfo.defineCloseInsideCopyAfterTags(value);
+                definer = "defineCloseInsideCopyAfterTags";
+            } else if ( "close-before-tags".equals(dependencyName) ) {
+                tagInfo.defineCloseBeforeTags(value);
+                definer = "defineCloseBeforeTags";
+            } else {
+                return;
+            }
+            if (generateCode) {
+                System.out.println("tagInfo." + definer + "(\"" + value + "\");");
+            }
+        }
+    }
+
+}
@@ -0,0 +1,72 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+
+/**
+ * <p>HTML text token: a node that carries a piece of text content.</p>
+ */
+public class ContentNode extends BaseHtmlNode implements HtmlNode {
+
+    /** The text of this node, exactly as passed to the constructor. */
+    protected final String content;
+
+    /** Result of {@code Utils.isEmptyString(content)}, computed once at construction. */
+    protected final boolean blank;
+
+    /**
+     * Creates a text node for the given content.
+     *
+     * @param content text held by this node
+     */
+    public ContentNode(String content) {
+        this.content = content;
+        this.blank = Utils.isEmptyString(content);
+    }
+
+    /**
+     * @return the flag computed at construction by {@code Utils.isEmptyString} for the content
+     */
+    public boolean isBlank() {
+        return blank;
+    }
+
+    /**
+     * @return the text held by this node
+     */
+    public String getContent() {
+        return content;
+    }
+
+    /**
+     * Writes the text of this node, as returned by {@link #getContent()}, to the writer.
+     * The serializer argument is not used.
+     *
+     * @param serializer serializer driving the output (ignored here)
+     * @param writer destination of the text
+     * @throws IOException if writing fails
+     */
+    public void serialize(Serializer serializer, Writer writer) throws IOException {
+        final String text = getContent();
+        writer.write(text);
+    }
+
+    /**
+     * @return the same value as {@link #getContent()}
+     */
+    @Override
+    public String toString() {
+        return getContent();
+    }
+}
@@ -0,0 +1,76 @@
+/*  
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+package org.htmlcleaner;
+
+/**
+ * Kind of content a tag may hold; each constant carries a textual code.
+ *
+ * @author patmoore
+ */
+public enum ContentType {
+    all("all"),
+    /**
+     * Elements that have no children or content (for example {@code <img>}). For these elements,
+     * the check for null elements must be more than just a children/content check.
+     */
+    none("none"),
+    text("text");
+
+    private final String dbCode;
+
+    ContentType(String dbCode) {
+        this.dbCode = dbCode;
+    }
+
+    /**
+     * @return the dbCode
+     */
+    public String getDbCode() {
+        return dbCode;
+    }
+
+    /**
+     * Converts an arbitrary value to a content type.
+     *
+     * @param value a {@code ContentType} (returned as is), or any object whose trimmed string
+     *        form is compared, ignoring case, with each constant's code and name
+     * @return the first matching constant, or {@code null} if {@code value} is {@code null}
+     *         or matches no constant
+     */
+    public static ContentType toValue(Object value) {
+        if (value instanceof ContentType) {
+            return (ContentType) value;
+        }
+        if (value == null) {
+            return null;
+        }
+
+        final String wanted = value.toString().trim();
+        for (ContentType candidate : values()) {
+            final boolean codeMatches = candidate.dbCode.equalsIgnoreCase(wanted);
+            if (codeMatches || candidate.name().equalsIgnoreCase(wanted)) {
+                return candidate;
+            }
+        }
+        return null;
+    }
+}
@@ -0,0 +1,645 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+
+/**
+ * This is the default tag provider for HTML Cleaner
+ * Note this is no longer generated from XML - see https://sourceforge.net/p/htmlcleaner/bugs/81/
+ */
+public class DefaultTagProvider implements ITagInfoProvider {
+
+    private static final String STRONG = "strong";
+    private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
+    // singleton instance, used if no other TagInfoProvider is specified
+    public final static DefaultTagProvider INSTANCE= new DefaultTagProvider();
+    
+    private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo,"+STRONG+",em,q,b,i,u,tt,sub,sup,big,small,strike,s,font";
+    private static final String CLOSE_BEFORE_TAGS = "h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
+    
+    /**
+     * Phrasing tags are those that can make up paragraphs along with text to make Phrasing Content
+     */
+    private static final String PHRASING_TAGS = "a,abbr,area,audio,b,bdi,bdo,br,button,canvas,cite,code,data,datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,keygen,label,link,map,mark,math,meta,meter,noscript,object,output,progress,q,ruby,s,samp,script,select,small,span,strong,sub,sup,svg,template,textarea,time,u,var,video,wbr";
+
+    /**
+     * HTML5 Media Tags
+     */
+    private static final String MEDIA_TAGS = "audio,video";
+    
+    public DefaultTagProvider() {
+        TagInfo tagInfo;
+        
+
+//        private static final Set<String> END_TAG_OPTIONAL = Collections.unmodifiableSet(new HashSet(Arrays.asList(
+//            "thead", "dt", "body", "tr", "colgroup", "td", "tfoot", "th", "li", "dd", "tbody", "p", "html", "head", "option")));
+//        private static final Set<String> END_TAG_FORBIDDEN = Collections.unmodifiableSet(new HashSet(Arrays.asList(
+//            "hr", "col", "param", "link", "img", "br", "meta", "input", "frame", "area", "basefont", "base", "isindex")));
+//        private static final Set<String> END_TAG_REQUIRED = Collections.unmodifiableSet(new HashSet(Arrays.asList(
+//            "noscript", "kbd", "center", "button", "h5", "h4", "samp", "ol", "h6", "h1", "h3", "h2", "form", "select",
+//            "font", "menu", "ins",
+//            "abbr", "label", "table", "code", "script", "cite", "iframe", "strong", "textarea", "noframes", "big",
+//            "small", "span", "sub", "optgroup", "bdo", "var", "div", "object", "sup", "title", "strike", "style",
+//            "dir", "map", "applet", "dl", "del", "fieldset", "ul", "b", "acronym", "a", "blockquote",
+//            "caption", "i", "u", "s", "frameset", "tt", "address", "q", "pre", "legend", "em", "dfn")));
+        tagInfo = new TagInfo("div", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("div", tagInfo);
+        
+        /**
+         * The HTML5 semantic flow tags
+         */
+        
+        // Sectioning tags
+        tagInfo = new TagInfo("aside", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p");
+        this.put("aside", tagInfo);
+        
+        tagInfo = new TagInfo("section", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p");
+        this.put("section", tagInfo);
+        
+        tagInfo = new TagInfo("article", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p");
+        this.put("article", tagInfo);
+        
+        tagInfo = new TagInfo("main", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p");
+        this.put("main", tagInfo);
+        
+        tagInfo = new TagInfo("nav", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p");
+        this.put("nav", tagInfo);
+        
+        tagInfo = new TagInfo("details", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p");
+        this.put("details", tagInfo);
+        tagInfo = new TagInfo("summary", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineRequiredEnclosingTags("details");
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p");
+        this.put("summary", tagInfo);
+        
+        tagInfo = new TagInfo("figure", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p");
+        this.put("figure", tagInfo);
+        tagInfo = new TagInfo("figcaption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        tagInfo.defineRequiredEnclosingTags("figure");
+        this.put("figcaption", tagInfo);
+        
+        // header and footer
+        tagInfo = new TagInfo("header", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,header,footer,main");
+        this.put("header", tagInfo);
+        
+        tagInfo = new TagInfo("footer", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,header,footer,main");
+        this.put("footer", tagInfo);
+        
+        /**
+         * Html5 phrasing tags
+         */
+        tagInfo = new TagInfo("mark", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        this.put("mark", tagInfo);
+
+        tagInfo = new TagInfo("bdi", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        this.put("bdi", tagInfo);
+        
+        tagInfo = new TagInfo("time", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        this.put("time", tagInfo);
+        
+        tagInfo = new TagInfo("meter", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        tagInfo.defineCloseBeforeTags("meter");
+        this.put("meter", tagInfo);
+        
+        
+        /**
+         * Html5 Ruby text
+         */
+        tagInfo = new TagInfo("ruby", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineAllowedChildrenTags("rt,rp");
+        this.put("ruby", tagInfo);
+        
+        tagInfo = new TagInfo("rt", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.inline);
+        //
+        // If we include this rule, we get an out-of-memory error. See issue 126.
+        //
+        //tagInfo.defineRequiredEnclosingTags("ruby");
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        this.put("rt", tagInfo);
+        
+        tagInfo = new TagInfo("rp", ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.inline);
+        //
+        // If we include this rule, we get an out-of-memory error. See issue 126.
+        //
+        //tagInfo.defineRequiredEnclosingTags("ruby");
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        this.put("rp", tagInfo);
+        
+        /**
+         * Html5 media tags
+         */
+        tagInfo = new TagInfo("audio", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
+        this.put("audio", tagInfo);
+        
+        tagInfo = new TagInfo("video", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
+        this.put("video", tagInfo);
+        
+        tagInfo = new TagInfo("source", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.any);
+        tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
+        this.put("source", tagInfo);
+        
+        tagInfo = new TagInfo("track", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.any);
+        tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
+        this.put("track", tagInfo);
+        
+        tagInfo = new TagInfo("canvas", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        this.put("canvas", tagInfo);
+        
+        /**
+         * Html5 interactive tags
+         */
+        tagInfo = new TagInfo("dialog", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        this.put("dialog", tagInfo);
+        
+        tagInfo = new TagInfo("progress", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        tagInfo.defineCloseBeforeTags("progress");
+        this.put("progress", tagInfo);
+        
+        /**
+         * HTML 4 and earlier tags
+         */
+
+        tagInfo = new TagInfo("span", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("span", tagInfo);
+
+        tagInfo = new TagInfo("meta", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
+        this.put("meta", tagInfo);
+
+        tagInfo = new TagInfo("link", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
+        this.put("link", tagInfo);
+
+        tagInfo = new TagInfo("title",  ContentType.text, BelongsTo.HEAD, false, true, false, CloseTag.required, Display.none);
+        this.put("title", tagInfo);
+
+        tagInfo = new TagInfo("style",  ContentType.text, BelongsTo.HEAD, false, false, false, CloseTag.required, Display.none);
+        this.put("style", tagInfo);
+
+        tagInfo = new TagInfo("bgsound", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
+        this.put("bgsound", tagInfo);
+
+        tagInfo = new TagInfo("h1", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+        this.put("h1", tagInfo);
+
+        tagInfo = new TagInfo("h2", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+        this.put("h2", tagInfo);
+
+        tagInfo = new TagInfo("h3", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+        this.put("h3", tagInfo);
+
+        tagInfo = new TagInfo("h4", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+        this.put("h4", tagInfo);
+
+        tagInfo = new TagInfo("h5", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+        this.put("h5", tagInfo);
+
+        tagInfo = new TagInfo("h6", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+        this.put("h6", tagInfo);
+
+        // jericho parser requires <p></p>
+        tagInfo = new TagInfo("p", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("p", tagInfo);
+
+        tagInfo = new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put(STRONG, tagInfo);
+
+        tagInfo = new TagInfo("em", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("em", tagInfo);
+
+        tagInfo = new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("abbr", tagInfo);
+
+        tagInfo = new TagInfo("acronym", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("acronym", tagInfo);
+
+        tagInfo = new TagInfo("address", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("address", tagInfo);
+
+        tagInfo = new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("bdo", tagInfo);
+
+        tagInfo = new TagInfo("blockquote", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("blockquote", tagInfo);
+
+        tagInfo = new TagInfo("cite", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("cite", tagInfo);
+
+        tagInfo = new TagInfo("q", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("q", tagInfo);
+
+        tagInfo = new TagInfo("code", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("code", tagInfo);
+
+        tagInfo = new TagInfo("ins", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        this.put("ins", tagInfo);
+
+        tagInfo = new TagInfo("del", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        this.put("del", tagInfo);
+
+        tagInfo = new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("dfn", tagInfo);
+
+        tagInfo = new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("kbd", tagInfo);
+
+        tagInfo = new TagInfo("pre", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("pre", tagInfo);
+
+        tagInfo = new TagInfo("samp", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("samp", tagInfo);
+
+        tagInfo = new TagInfo("listing", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("listing", tagInfo);
+
+        tagInfo = new TagInfo("var", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("var", tagInfo);
+
+        tagInfo = new TagInfo("br", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
+        this.put("br", tagInfo);
+
+        tagInfo = new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
+        this.put("wbr", tagInfo);
+
+        tagInfo = new TagInfo("nobr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseBeforeTags("nobr");
+        this.put("nobr", tagInfo);
+
+        tagInfo = new TagInfo("xmp",  ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("xmp", tagInfo);
+
+        tagInfo = new TagInfo("a", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseBeforeTags("a");
+        this.put("a", tagInfo);
+
+        tagInfo = new TagInfo("base", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none);
+        this.put("base", tagInfo);
+
+        tagInfo = new TagInfo("img", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
+        this.put("img", tagInfo);
+
+        tagInfo = new TagInfo("area", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
+        tagInfo.defineFatalTags("map");
+        tagInfo.defineCloseBeforeTags("area");
+        this.put("area", tagInfo);
+
+        tagInfo = new TagInfo("map", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        tagInfo.defineCloseBeforeTags("map");
+        this.put("map", tagInfo);
+
+        tagInfo = new TagInfo("object", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        this.put("object", tagInfo);
+
+        tagInfo = new TagInfo("param", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("param", tagInfo);
+
+        tagInfo = new TagInfo("applet", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.any);
+        this.put("applet", tagInfo);
+
+        tagInfo = new TagInfo("xml", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
+        this.put("xml", tagInfo);
+
+        tagInfo = new TagInfo("ul", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("ul", tagInfo);
+
+        tagInfo = new TagInfo("ol", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("ol", tagInfo);
+
+        tagInfo = new TagInfo("li", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("li,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("li", tagInfo);
+
+        tagInfo = new TagInfo("dl", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("dl", tagInfo);
+
+        tagInfo = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineCloseBeforeTags("dt,dd");
+        this.put("dt", tagInfo);
+
+        tagInfo = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineCloseBeforeTags("dt,dd");
+        this.put("dd", tagInfo);
+
+        tagInfo = new TagInfo("menu", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("menu", tagInfo);
+
+        tagInfo = new TagInfo("dir", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("dir", tagInfo);
+
+        tagInfo = new TagInfo("table", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,colgroup,caption");
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("table", tagInfo);
+
+        tagInfo = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineRequiredEnclosingTags("tbody");
+        tagInfo.defineAllowedChildrenTags("td,th");
+        tagInfo.defineHigherLevelTags("thead,tfoot");
+        tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup");
+        this.put("tr", tagInfo);
+
+        // jericho parser requires <td></td>
+        tagInfo = new TagInfo("td", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineRequiredEnclosingTags("tr");
+        tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
+        this.put("td", tagInfo);
+
+        tagInfo = new TagInfo("th", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineRequiredEnclosingTags("tr");
+        tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
+        this.put("th", tagInfo);
+
+        tagInfo = new TagInfo("tbody", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineAllowedChildrenTags("tr,form");
+        tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+        this.put("tbody", tagInfo);
+
+        tagInfo = new TagInfo("thead", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineAllowedChildrenTags("tr,form");
+        tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+        this.put("thead", tagInfo);
+
+        tagInfo = new TagInfo("tfoot", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineAllowedChildrenTags("tr,form");
+        tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+        this.put("tfoot", tagInfo);
+
+        tagInfo = new TagInfo("col", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
+        tagInfo.defineFatalTags("colgroup");
+        this.put("col", tagInfo);
+
+        tagInfo = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineAllowedChildrenTags("col");
+        tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+        this.put("colgroup", tagInfo);
+
+        tagInfo = new TagInfo("caption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+        this.put("caption", tagInfo);
+
+        tagInfo = new TagInfo("form", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.block);
+        tagInfo.defineForbiddenTags("form");
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("form", tagInfo);
+
+        tagInfo = new TagInfo("input", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
+        tagInfo.defineCloseBeforeTags("select,optgroup,option");
+        this.put("input", tagInfo);
+
+        tagInfo = new TagInfo("textarea", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseBeforeTags("select,optgroup,option");
+        this.put("textarea", tagInfo);
+
+        tagInfo = new TagInfo("select", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
+        tagInfo.defineAllowedChildrenTags("option,optgroup");
+        tagInfo.defineCloseBeforeTags("option,optgroup,select");
+        this.put("select", tagInfo);
+
+        tagInfo = new TagInfo("option",  ContentType.text, BelongsTo.BODY, false, false, true, CloseTag.optional, Display.inline);
+        tagInfo.defineFatalTags("select");
+        tagInfo.defineCloseBeforeTags("option");
+        this.put("option", tagInfo);
+
+        tagInfo = new TagInfo("optgroup", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
+        tagInfo.defineFatalTags("select");
+        tagInfo.defineAllowedChildrenTags("option");
+        tagInfo.defineCloseBeforeTags("optgroup");
+        this.put("optgroup", tagInfo);
+
+        tagInfo = new TagInfo("button", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        tagInfo.defineCloseBeforeTags("select,optgroup,option");
+        this.put("button", tagInfo);
+
+        tagInfo = new TagInfo("label", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        this.put("label", tagInfo);
+        
+        tagInfo = new TagInfo("legend", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        //
+        // If we include this rule, we get an out-of-memory error. See issue 129.
+        //
+        //tagInfo.defineRequiredEnclosingTags("fieldset");
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        this.put("legend", tagInfo);
+
+        tagInfo = new TagInfo("fieldset", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("fieldset", tagInfo);
+
+        tagInfo = new TagInfo("isindex", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("isindex", tagInfo);
+
+        tagInfo = new TagInfo("script", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.none);
+        this.put("script", tagInfo);
+
+        tagInfo = new TagInfo("noscript", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.block);
+        this.put("noscript", tagInfo);
+
+        tagInfo = new TagInfo("b", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("u,i,tt,sub,sup,big,small,strike,blink,s");
+        this.put("b", tagInfo);
+
+        tagInfo = new TagInfo("i", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,tt,sub,sup,big,small,strike,blink,s");
+        this.put("i", tagInfo);
+
+        tagInfo = new TagInfo("u", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,i,tt,sub,sup,big,small,strike,blink,s");
+        this.put("u", tagInfo);
+
+        tagInfo = new TagInfo("tt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,big,small,strike,blink,s");
+        this.put("tt", tagInfo);
+
+        tagInfo = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sup,big,small,strike,blink,s");
+        this.put("sub", tagInfo);
+
+        tagInfo = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,big,small,strike,blink,s");
+        this.put("sup", tagInfo);
+
+        tagInfo = new TagInfo("big", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,small,strike,blink,s");
+        this.put("big", tagInfo);
+
+        tagInfo = new TagInfo("small", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,strike,blink,s");
+        this.put("small", tagInfo);
+
+        tagInfo = new TagInfo("strike", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,blink,s");
+        this.put("strike", tagInfo);
+
+        tagInfo = new TagInfo("blink", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,s");
+        this.put("blink", tagInfo);
+
+        tagInfo = new TagInfo("marquee", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("marquee", tagInfo);
+
+        tagInfo = new TagInfo("s", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,blink");
+        this.put("s", tagInfo);
+
+        tagInfo = new TagInfo("hr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("hr", tagInfo);
+
+        tagInfo = new TagInfo("font", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
+        this.put("font", tagInfo);
+
+        tagInfo = new TagInfo("basefont", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.none);
+        this.put("basefont", tagInfo);
+
+        tagInfo = new TagInfo("center", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("center", tagInfo);
+
+        tagInfo = new TagInfo("comment", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
+        this.put("comment", tagInfo);
+
+        tagInfo = new TagInfo("server", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none);
+        this.put("server", tagInfo);
+
+        tagInfo = new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        this.put("iframe", tagInfo);
+
+        tagInfo = new TagInfo("embed", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+        this.put("embed", tagInfo);
+    }
+
+    /**
+     * Registers a tag definition in the tag info map.
+     *
+     * @param tagName name under which the definition is stored
+     * @param tagInfo definition to associate with {@code tagName}
+     */
+    protected void put(String tagName, TagInfo tagInfo) {
+        tagInfoMap.put(tagName, tagInfo);
+    }
+
+    /**
+     * Looks up the definition registered for a tag name.
+     *
+     * @param tagName tag name to look up; may be {@code null}
+     * @return the value held in the tag info map for {@code tagName}, or
+     *         {@code null} when {@code tagName} itself is {@code null}
+     */
+    public TagInfo getTagInfo(String tagName) {
+        // a null-named tag node happens when an html fragment is being dealt with
+        return tagName == null ? null : tagInfoMap.get(tagName);
+    }
+
+}
@@ -0,0 +1,62 @@
+package org.htmlcleaner;
+
+/**
+ * Most HTML 4 elements permitted within the BODY are classified as either
+ * block-level elements or inline elements. This enumeration contains
+ * corresponding constants to distinguish them, and attaches two formatting
+ * hints to each constant.
+ *
+ * @author Konstantin Burov (aectann@gmail.com)
+ */
+public enum Display {
+    /**
+     * Block-level elements typically contain inline elements and other
+     * block-level elements. When rendered visually, block-level elements
+     * usually begin on a new line.
+     */
+    block(true, false),
+
+    /**
+     * Inline elements typically may only contain text and other inline
+     * elements. When rendered visually, inline elements do not usually begin on
+     * a new line.
+     */
+    inline(false, true),
+
+    /**
+     * Elements that may be used as either block-level elements or inline
+     * elements. If used as inline elements (e.g., within another inline
+     * element or a P), these elements should not contain any block-level
+     * elements.
+     */
+    any(true, false),
+
+    /**
+     * Elements that are not actually inline or block, usually such elements are
+     * not rendered at all.
+     */
+    none(true, false);
+
+    /** Hint for serializers: put a line break after tags of this display type. */
+    private final boolean afterTagLineBreakNeeded;
+
+    /** Whether tag contents can have a single leading or end whitespace. */
+    private final boolean leadingAndEndWhitespacesAllowed;
+
+    /**
+     * @param afterTagLineBreakNeeded value reported by {@link #isAfterTagLineBreakNeeded()}
+     * @param leadingAndEndWhitespacesAllowed value reported by {@link #isLeadingAndEndWhitespacesAllowed()}
+     */
+    Display(boolean afterTagLineBreakNeeded, boolean leadingAndEndWhitespacesAllowed) {
+        this.afterTagLineBreakNeeded = afterTagLineBreakNeeded;
+        this.leadingAndEndWhitespacesAllowed = leadingAndEndWhitespacesAllowed;
+    }
+
+    /**
+     * @return true to advise serializers to put line break after tags with such a display type.
+     */
+    public boolean isAfterTagLineBreakNeeded() {
+        return afterTagLineBreakNeeded;
+    }
+
+    /**
+     * @return true if tag contents can have single leading or end whitespace
+     */
+    public boolean isLeadingAndEndWhitespacesAllowed() {
+        return leadingAndEndWhitespacesAllowed;
+    }
+}
@@ -0,0 +1,389 @@
+/*  Copyright (c) 2006-2013, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+
+/**
+ * <p>HTML doctype token.</p>
+ */
+public class DoctypeToken extends BaseHtmlNode implements HtmlNode{
+
+	//
+	// Part 1 is the document type, typically 'html' or 'HTML'
+	//
+    private String part1;
+    
+    //
+    // Part 2 is the PUBLIC or SYSTEM token
+    //
+    private String part2;
+    
+    //
+    // Part 3 is the PUBLIC identifier, typically '-//W3C//DTD HTML 4.01//EN' or similar
+    //
+    private String part3;
+    
+    //
+    // Part 4 is the SYSTEM identifier, typically a URL for the DTD
+    //
+    private String part4;
+    
+    /**
+     * The identified DocType, if any
+     */ 
+    private Integer type = null;
+    
+    
+    //
+    // Constants for identified doctypes
+    //
+    
+    public static final int UNKNOWN = 0;
+    public static final int HTML4_0 = 10;
+    public static final int HTML4_01 = 20;
+    public static final int HTML4_01_STRICT = 21;
+    public static final int HTML4_01_TRANSITIONAL = 22; 
+    public static final int HTML4_01_FRAMESET = 23; 
+    public static final int XHTML1_0_STRICT = 31;
+    public static final int XHTML1_0_TRANSITIONAL = 32;
+    public static final int XHTML1_0_FRAMESET = 33;
+    public static final int XHTML1_1 = 40;
+    public static final int XHTML1_1_BASIC = 41;
+    public static final int HTML5 = 60;
+    public static final int HTML5_LEGACY_TOOL_COMPATIBLE = 61;
+    
+    //
+    // Whether the DocType is valid
+    //
+    private Boolean valid = null;
+
+    public DoctypeToken(String part1, String part2, String part3, String part4) {
+        this.part1 = part1;
+        this.part2 = part2 != null ? part2.toUpperCase() : part2;
+        this.part3 = clean(part3);
+        this.part4 = clean(part4);
+        validate();
+    }
+    
+    /*
+     * Constructor for 5-part DocTypes, e.g. <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" SYSTEM "http://www.w3.org/TR/html4/strict.dtd">.
+     * For this we ignore part4 as we assume that must be "SYSTEM".
+     */
+    public DoctypeToken(String part1, String part2, String part3, String part4, String part5) {
+        this.part1 = part1;
+        this.part2 = part2 != null ? part2.toUpperCase() : part2;
+        this.part3 = clean(part3);
+        this.part4 = clean(part5);
+        validate();
+    }
+
+    private String clean(String s) {
+    	if (s != null) {
+	    	s = s.replace('>', ' ');
+	    	s = s.replace('<', ' ');
+	    	s = s.replace('&', ' ');
+	    	s = s.replace('\'', ' ');
+	    	s = s.replace('\"', ' ');
+    	}
+
+    	return s;
+    }
+    
+    public boolean isValid(){
+    	return valid;
+    }
+
+    /**
+     * Checks the doctype according to W3C parsing rules and tries to identify
+     * the type and validity
+     * 
+     * See: 
+     *  <ul>
+     *    <li>http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax</li>
+     *    <li>http://dev.w3.org/html5/html-author/#doctype-declaration</li>
+     *  </ul>
+     */
+    private void validate() {
+
+    	//
+    	// No PUBLIC or SYSTEM token
+    	//
+    	if (!"public".equalsIgnoreCase(part2) && !"system".equalsIgnoreCase(part2)) {
+
+    		//
+    		// HTML 5
+    		//
+    		if ("html".equalsIgnoreCase(part1) && (part2 == null)){
+    			type = HTML5;
+    			valid = true;
+    		} 
+    	}
+
+    	if ("public".equalsIgnoreCase(part2)){
+    		
+			//
+			// HTML 4.0 is valid without an ID, or with strict DTD ID
+			//
+    		if ("-//W3C//DTD HTML 4.0//EN".equals(getPublicId())){
+				type = HTML4_0;
+    			if ("http://www.w3.org/TR/REC-html40/strict.dtd".equals(part4) || "".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+    		}
+
+			//
+			// HTML 4.0.1 STRICT is valid with Strict dtd ID or empty
+			//
+    		if ("-//W3C//DTD HTML 4.01//EN".equals(getPublicId())){
+				type = HTML4_01_STRICT;
+    			if ("http://www.w3.org/TR/html4/strict.dtd".equals(part4) || "".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+    		}
+
+			//
+			// HTML 4.0.1 TRANSITIONAL valid only with Transitional DTD ID
+			//
+    		if ("-//W3C//DTD HTML 4.01 Transitional//EN".equals(getPublicId())){
+				type = HTML4_01_TRANSITIONAL;
+    			if ("http://www.w3.org/TR/html4/loose.dtd".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+    		}
+
+			//
+			// HTML 4.0.1 FRAMESET valid only with Frameset ID
+			//
+    		if ("-//W3C//DTD HTML 4.01 Frameset//EN".equals(getPublicId())){
+				type = HTML4_01_FRAMESET;
+
+    			if ("http://www.w3.org/TR/html4/frameset.dtd".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+    		}
+
+				
+			//
+			// XHTML 1.0
+			//
+    		if ("-//W3C//DTD XHTML 1.0 Strict//EN".equals(getPublicId())){
+  				type = XHTML1_0_STRICT;
+    			if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+
+    		}
+
+			//
+			// XHTML 1.0 Transitional
+			//
+    		if ("-//W3C//DTD XHTML 1.0 Transitional//EN".equals(getPublicId())){
+				type = XHTML1_0_TRANSITIONAL;
+
+    			if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+    		}
+
+			//
+			// XHTML 1.0 Frameset
+			//
+    		if ("-//W3C//DTD XHTML 1.0 Frameset//EN".equals(getPublicId())){
+				type = XHTML1_0_FRAMESET;
+
+    			if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+    		}
+    		
+			//
+			// XHTML 1.1
+			//
+    		if ("-//W3C//DTD XHTML 1.1//EN".equals(getPublicId())){
+				type = XHTML1_1;
+    			if ("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+    		}
+
+			// 
+			// XHTML 1.1 Basic
+			//
+    		if ("-//W3C//DTD XHTML Basic 1.1//EN".equals(getPublicId())){
+				type = XHTML1_1_BASIC;
+
+    			if ("http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd".equals(getSystemId())){
+    				valid = true;
+    			} else {
+    				valid = false;
+    			}
+    		}
+    	}
+
+    	if ("system".equalsIgnoreCase(part2)){
+
+    		//
+    		// HTML 5 legacy tool compatible
+    		//
+    		if ("about:legacy-compat".equals(getPublicId())){
+    			type = HTML5_LEGACY_TOOL_COMPATIBLE;
+    			valid = true;
+    		}
+    	}
+
+    	if (type == null){
+    		type = UNKNOWN;
+    		valid = false;
+    	}
+    }
+
+    public String getContent() {
+    	
+    	if (type == UNKNOWN && part1 == null){
+    		return "<!DOCTYPE>";
+    	}
+    	
+    	String result = "<!DOCTYPE ";
+    		
+    	//
+    	// If the type is XHTML or HTML5, the output is "html", otherwise it should be "HTML"
+    	//
+    	if (type != UNKNOWN){
+    		if (type >= 30){
+    			result += "html";
+    		} else {
+    			result += "HTML";
+    		}
+    	} else {
+    		//
+    		// if its an unknown doctype, just pass through as-is.
+    		//
+    		result += part1 ;
+    	}
+    	
+
+    	if (part2 != null){
+    		result += " " + part2 + " \"" + part3 + "\"";
+    		
+    		if (!"".equals(part4) ) {
+    			result += " \"" + part4 + "\"";
+    		}
+    	}
+
+        result += ">";
+        return result;
+    }
+
+    @Override
+    public String toString() {
+        return getContent();
+    }
+    
+    /**
+     * This will retrieve an integer representing the identified DocType
+     */
+    public int getType(){
+    	return type;
+    }
+
+    public String getName() {
+    	return "";
+    }
+
+    public void serialize(Serializer serializer, Writer writer) throws IOException {
+    	writer.write(getContent() + "\n");
+    }
+    
+    /**
+     * This will retrieve the public ID of an externally referenced DTD, or an empty String if none is referenced.
+     */
+    public String getPublicId(){
+    	return part3;
+    }
+    
+    /**
+     * This will retrieve the system ID of an externally referenced DTD, or an empty String if none is referenced.
+     */
+    public String getSystemId(){
+    	return part4;
+    }
+    
+    public String getPart1() {
+        return part1;
+    }
+
+    public String getPart2() {
+        return part2;
+    }
+
+    /**
+     * Deprecated - use getPublicId() instead
+     * @return the third part of the DOCSTRING
+     */
+    @Deprecated
+    public String getPart3() {
+        return part3;
+    }
+
+    /**
+     * Deprecated - use getSystemId() instead
+     * @return the fourth part of the DOCSTRING
+     */
+    @Deprecated
+    public String getPart4() {
+        return part4;
+    }
+}
@@ -0,0 +1,275 @@
+package org.htmlcleaner;
+
+import java.util.Iterator;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.CDATASection;
+import org.w3c.dom.Comment;
+import org.w3c.dom.DOMImplementation;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentType;
+import org.w3c.dom.Element;
+
+/**
+ * {@link XmlVisitor} implementation that assembles a W3C DOM {@link Document} from the
+ * HtmlCleaner nodes handed to {@link #head(HtmlNode, int)} and {@link #tail(HtmlNode, int)}.
+ *
+ * <p>The document is created lazily: the first {@link TagNode} with a usable XML name that
+ * reaches {@code head} is passed to {@link #createDocument(TagNode)} and the resulting document
+ * element becomes the initial insertion point. Every later tag is appended to the current
+ * insertion point and then becomes the insertion point itself; {@code tail} moves the insertion
+ * point back to the parent element.</p>
+ *
+ * <p>NOTE(review): presumably the traversal calls {@code head} before a node's children and
+ * {@code tail} after them - confirm against the code that drives the visitor.</p>
+ */
+public class DomBuilder implements XmlVisitor {
+
+    private static final String CSS_COMMENT_START = "/*";
+
+    /** Cleaner settings consulted for escaping, CDATA handling and attribute-name policy. */
+    private final CleanerProperties props;
+
+    /** The document under construction; {@code null} until the first valid tag is seen. */
+    private Document document;
+
+    /** The element that currently receives new child nodes. */
+    private Element destinationElement;
+
+    /** Whether attribute values and text are XML-escaped. */
+    protected boolean escapeXml = true;
+
+    /** Whether entities inside CDATA-wrapped content are deserialized. */
+    protected boolean deserializeCdataEntities = false;
+
+    /** When {@code false}, strict error checking is switched off on the created document. */
+    protected boolean strictErrorChecking = true;
+
+    /**
+     * @param props the HTML Cleaner properties that control the conversion
+     * @param escapeXml if true then escape XML entities
+     * @param deserializeCdataEntities if true then deserialize entities in CDATA-wrapped content
+     * @param strictErrorChecking if false then Document strict error checking is turned off
+     */
+    public DomBuilder(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking) {
+        this.props = props;
+        this.escapeXml = escapeXml;
+        this.deserializeCdataEntities = deserializeCdataEntities;
+        this.strictErrorChecking = strictErrorChecking;
+    }
+
+    /**
+     * @return the document built so far, or {@code null} if no tag has created it yet
+     */
+    public Document getDocument() {
+        return this.document;
+    }
+
+    /**
+     * @return true if XML escaping, unicode recognition or special-entity translation is enabled
+     */
+    private boolean shouldEscapeOrTranslateEntities() {
+        return escapeXml || props.isRecognizeUnicodeChars() || props.isTranslateSpecialEntities();
+    }
+
+    /**
+     * Adds the DOM counterpart of {@code node} at the current insertion point.
+     *
+     * <p>Comments become DOM comments, content nodes become text (or are appended to a CDATA
+     * section when one was opened in this call), and tags become elements. A tag whose name
+     * cannot be turned into a valid XML identifier is emitted as escaped text instead.</p>
+     *
+     * <p>For a tag that {@code props.isUseCdataFor} reports as CDATA-wrapped, a
+     * {@code "/*"} text node and an empty CDATA section are appended to the current insertion
+     * point before the tag's own element is created. NOTE(review): nothing in this class emits
+     * the matching closing tokens, unlike {@code DomSerializer.createSubnodes} - confirm
+     * whether that is intended.</p>
+     *
+     * @param node the node being visited
+     * @param depth not used by this implementation
+     * @throws IllegalStateException if the DOM document cannot be created for the root tag
+     */
+    public void head(HtmlNode node, int depth) {
+
+        //
+        // For script and style nodes, check if we're set to use CDATA
+        //
+        CDATASection cdata = null;
+        if (node instanceof TagNode && props.isUseCdataFor(((TagNode) node).getName())) {
+            cdata = document.createCDATASection("");
+            destinationElement.appendChild(document.createTextNode(CSS_COMMENT_START));
+            destinationElement.appendChild(cdata);
+        }
+
+        if (node instanceof CommentNode) {
+            Comment comment = document.createComment(((CommentNode) node).getContent());
+            destinationElement.appendChild(comment);
+        } else if (node instanceof ContentNode) {
+            appendContent(node, cdata);
+        } else if (node instanceof TagNode) {
+            appendTag((TagNode) node);
+        }
+    }
+
+    /**
+     * Appends the text of a content node, applying escaping and CDATA rules first.
+     *
+     * @param node the visited node, known to be a {@link ContentNode}
+     * @param cdata the CDATA section opened for this node in {@code head}, or {@code null}
+     */
+    private void appendContent(HtmlNode node, CDATASection cdata) {
+        String content = ((ContentNode) node).getContent();
+
+        // Content whose parent tag is CDATA-wrapped is never XML-escaped.
+        boolean specialCase = props.isUseCdataFor(node.getParent().getName());
+
+        if (shouldEscapeOrTranslateEntities() && !specialCase) {
+            content = Utils.escapeXml(content, props, true);
+        }
+
+        if (specialCase && node instanceof CData) {
+            //
+            // For CDATA sections we don't want to return the start and
+            // end tokens. See issue #106.
+            //
+            content = ((CData) node).getContentWithoutStartAndEndTokens();
+        }
+
+        if (specialCase && deserializeCdataEntities) {
+            content = this.deserializeCdataEntities(content);
+        }
+
+        if (cdata != null) {
+            cdata.appendData(content);
+        } else {
+            destinationElement.appendChild(document.createTextNode(content));
+        }
+    }
+
+    /**
+     * Creates the element for a tag (creating the document first if necessary), copies its
+     * attributes and makes it the new insertion point.
+     *
+     * @param tagNode the tag being visited
+     * @throws IllegalStateException if the DOM document cannot be created
+     */
+    private void appendTag(TagNode tagNode) {
+        //
+        // XML element names are more strict in their definition
+        // than HTML tag identifiers.
+        // See https://www.w3.org/TR/xml/#NT-Name
+        // vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
+        //
+        String name = Utils.sanitizeXmlIdentifier(tagNode.getName(), props.getInvalidXmlAttributeNamePrefix());
+
+        //
+        // If the element name is completely invalid, treat it as text
+        //
+        if (name == null) {
+            ContentNode contentNode = new ContentNode(tagNode.getName() + tagNode.getText().toString());
+            String content = Utils.escapeXml(contentNode.getContent(), props, true);
+            destinationElement.appendChild(document.createTextNode(content));
+            return;
+        }
+
+        if (document == null) {
+            try {
+                document = this.createDocument(tagNode);
+            } catch (ParserConfigurationException e) {
+                // Without a document nothing below can work; fail with the real cause instead
+                // of printing a stack trace and hitting a NullPointerException on the next line.
+                throw new IllegalStateException(
+                        "Unable to create the DOM document for root element <" + tagNode.getName() + ">", e);
+            }
+        }
+
+        Element element = document.createElement(name);
+
+        //
+        // Create attributes
+        //
+        Map<String, String> attributes = tagNode.getAttributes();
+        for (Map.Entry<String, String> entry : attributes.entrySet()) {
+            String attrName = entry.getKey();
+            String attrValue = entry.getValue();
+            if (escapeXml) {
+                attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
+                attrValue = Utils.escapeXml(attrValue, props, true);
+            }
+
+            //
+            // Fix any invalid attribute names by adding a prefix
+            //
+            if (!props.isAllowInvalidAttributeNames()) {
+                attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
+            }
+
+            if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())) {
+                element.setAttribute(attrName, attrValue);
+
+                //
+                // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
+                //
+                if (attrName.equalsIgnoreCase("id")) {
+                    element.setIdAttribute(attrName, true);
+                }
+            }
+        }
+
+        if (destinationElement == null) {
+            // First tag: createDocument already produced the root element (with its
+            // attributes), so the element created above is not attached.
+            destinationElement = document.getDocumentElement();
+        } else {
+            destinationElement.appendChild(element);
+            destinationElement = element;
+        }
+
+        //
+        // Hack for now, we need a better way to do this in future:
+        // content children need their parent set so that the content branch of head()
+        // can look up the parent's name.
+        //
+        for (Object token : tagNode.getAllChildren()) {
+            if (token instanceof ContentNode) {
+                ((ContentNode) token).setParent(tagNode);
+            }
+        }
+    }
+
+    /**
+     * Deserializes entities in CDATA-wrapped content; may be overridden.
+     *
+     * @param input the raw content
+     * @return the result of {@code Utils.deserializeEntities} for the input
+     */
+    protected String deserializeCdataEntities(String input) {
+        return Utils.deserializeEntities(input, props.isRecognizeUnicodeChars());
+    }
+
+    /**
+     * Moves the insertion point back to the parent element after a tag has been visited.
+     * Nothing happens for non-tag nodes or when the current element's parent is not an element.
+     *
+     * @param node the node being left
+     * @param depth not used by this implementation
+     */
+    public void tail(HtmlNode node, int depth) {
+        if (node instanceof TagNode && destinationElement.getParentNode() instanceof Element) {
+            destinationElement = (Element) destinationElement.getParentNode();
+        }
+    }
+
+    //
+    // Allow overriding of serialization for implementations. See bug #167.
+    //
+    /**
+     * Creates the DOM document, including its root element, DOCTYPE (when the root node has
+     * one) and the root node's attributes.
+     *
+     * @param rootNode the HtmlCleaner node that becomes the document element
+     * @return the newly created document
+     * @throws ParserConfigurationException if no {@link DocumentBuilder} can be created
+     */
+    protected Document createDocument(TagNode rootNode) throws ParserConfigurationException {
+
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        DocumentBuilder builder = factory.newDocumentBuilder();
+        DOMImplementation impl = builder.getDOMImplementation();
+
+        Document doc;
+
+        //
+        // Where a DOCTYPE is supplied in the input, ensure that this is in the output DOM. See issue #27
+        //
+        // Note that we may want to fix incorrect DOCTYPEs in future; there are some fairly
+        // common patterns for errors with the older HTML4 doctypes.
+        //
+        if (rootNode.getDocType() != null) {
+            String qualifiedName = rootNode.getDocType().getPart1();
+            String publicId = rootNode.getDocType().getPublicId();
+            String systemId = rootNode.getDocType().getSystemId();
+
+            //
+            // If there is no qualified name, set it to html. See bug #153.
+            //
+            if (qualifiedName == null) {
+                qualifiedName = "html";
+            }
+
+            DocumentType documentType = impl.createDocumentType(qualifiedName, publicId, systemId);
+
+            //
+            // While the qualified name is "HTML" for some DocTypes, we want the actual document root name to be "html". See bug #116
+            //
+            if ("HTML".equals(qualifiedName)) {
+                qualifiedName = "html";
+            }
+            doc = impl.createDocument(rootNode.getNamespaceURIOnPath(""), qualifiedName, documentType);
+        } else {
+            doc = builder.newDocument();
+            Element rootElement = doc.createElement(rootNode.getName());
+            doc.appendChild(rootElement);
+        }
+
+        //
+        // Turn off error checking if we're allowing invalid attribute names, or if we've chosen to turn it off
+        //
+        if (props.isAllowInvalidAttributeNames() || !strictErrorChecking) {
+            doc.setStrictErrorChecking(false);
+        }
+
+        //
+        // Copy across root node attributes - see issue 127. Thanks to rasifiel for the patch
+        //
+        Map<String, String> attributes = rootNode.getAttributes();
+        for (Map.Entry<String, String> entry : attributes.entrySet()) {
+            String attrName = entry.getKey();
+            String attrValue = entry.getValue();
+
+            //
+            // Fix any invalid attribute names
+            //
+            if (!props.isAllowInvalidAttributeNames()) {
+                attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
+            }
+
+            if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())) {
+
+                if (escapeXml) {
+                    attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
+                    attrValue = Utils.escapeXml(attrValue, props, true);
+                }
+
+                doc.getDocumentElement().setAttribute(attrName, attrValue);
+
+                //
+                // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
+                //
+                if (attrName.equalsIgnoreCase("id")) {
+                    doc.getDocumentElement().setIdAttribute(attrName, true);
+                }
+            }
+        }
+
+        return doc;
+    }
+
+}
@@ -0,0 +1,410 @@
+/*  Copyright (c) 2006-2019, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+package org.htmlcleaner;
+
+import org.w3c.dom.CDATASection;
+import org.w3c.dom.Comment;
+import org.w3c.dom.DOMImplementation;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentType;
+import org.w3c.dom.Element;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * <p>DOM serializer - creates xml DOM.</p>
+ *
+ * <p>Converts an HtmlCleaner {@link TagNode} tree into a W3C {@link Document}: the root node
+ * becomes the document element (see {@link #createDocument(TagNode)}) and all descendants are
+ * added recursively by {@link #createSubnodes(Document, Element, List)}.</p>
+ */
+public class DomSerializer {
+
+    private static final String CSS_COMMENT_START = "/*";
+
+    private static final String CSS_COMMENT_END = "*/";
+
+    private static final String NEW_LINE = "\n";
+
+    private static final String XML_10 = "1.0";
+
+    private static final String XML_11 = "1.1";
+
+    /**
+     * The HTML Cleaner properties set by the user to control the HTML cleaning.
+     */
+    protected CleanerProperties props;
+
+    /**
+     * Whether XML entities should be escaped or not.
+     */
+    protected boolean escapeXml = true;
+
+    /**
+     * Whether entities inside CDATA-wrapped content are deserialized.
+     */
+    protected boolean deserializeCdataEntities = false;
+
+    /**
+     * When {@code false}, strict error checking is switched off on the created document.
+     */
+    protected boolean strictErrorChecking = true;
+
+    /**
+     * The XML version set on created documents; {@code "1.0"} by default.
+     */
+    protected String xmlVersion = XML_10;
+
+    /**
+     * @return the XML version that is set on created documents
+     */
+    public String getXmlVersion() {
+        return xmlVersion;
+    }
+
+    /**
+     * Sets the XML version that is applied to created documents.
+     *
+     * @param xmlVersion either {@code "1.0"} or {@code "1.1"}
+     * @throws Exception if the value is anything else (including {@code null})
+     */
+    public void setXmlVersion(String xmlVersion) throws Exception {
+        // Compare by value: a reference comparison rejects equal strings that are not the
+        // interned literal (for example a value read from configuration).
+        if (XML_10.equals(xmlVersion) || XML_11.equals(xmlVersion)) {
+            this.xmlVersion = xmlVersion;
+        } else {
+            throw new Exception("Invalid XML version - must be 1.0 or 1.1");
+        }
+    }
+
+    /**
+     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
+     * @param escapeXml if true then escape XML entities
+     * @param deserializeCdataEntities if true then deserialize entities in CData sections
+     * @param strictErrorChecking if false then Document strict error checking is turned off
+     */
+    public DomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking) {
+        this.props = props;
+        this.escapeXml = escapeXml;
+        this.deserializeCdataEntities = deserializeCdataEntities;
+        this.strictErrorChecking = strictErrorChecking;
+    }
+
+    /**
+     * Same as the four-argument constructor with strict error checking left on.
+     *
+     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
+     * @param escapeXml if true then escape XML entities
+     * @param deserializeCdataEntities if true then deserialize entities in CData sections
+     */
+    public DomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities) {
+        this(props, escapeXml, deserializeCdataEntities, true);
+    }
+
+    /**
+     * Same as the three-argument constructor with CDATA entity deserialization left off.
+     *
+     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
+     * @param escapeXml if true then escape XML entities
+     */
+    public DomSerializer(CleanerProperties props, boolean escapeXml) {
+        this(props, escapeXml, false);
+    }
+
+    /**
+     * Creates a serializer that escapes XML entities.
+     *
+     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
+     */
+    public DomSerializer(CleanerProperties props) {
+        this(props, true);
+    }
+
+    //
+    // Allow overriding of serialization for implementations. See bug #167.
+    //
+    /**
+     * Creates the DOM document, including its root element, DOCTYPE (when the root node has
+     * one), XML version and the root node's attributes.
+     *
+     * @param rootNode the HtmlCleaner node that becomes the document element
+     * @return the newly created document
+     * @throws ParserConfigurationException if no {@link DocumentBuilder} can be created
+     */
+    protected Document createDocument(TagNode rootNode) throws ParserConfigurationException {
+
+        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+        DocumentBuilder builder = factory.newDocumentBuilder();
+        DOMImplementation impl = builder.getDOMImplementation();
+
+        Document document;
+
+        //
+        // Where a DOCTYPE is supplied in the input, ensure that this is in the output DOM. See issue #27
+        //
+        // Note that we may want to fix incorrect DOCTYPEs in future; there are some fairly
+        // common patterns for errors with the older HTML4 doctypes.
+        //
+        if (rootNode.getDocType() != null) {
+            String qualifiedName = rootNode.getDocType().getPart1();
+            String publicId = rootNode.getDocType().getPublicId();
+            String systemId = rootNode.getDocType().getSystemId();
+
+            //
+            // If there is no qualified name, set it to html. See bug #153.
+            //
+            if (qualifiedName == null) {
+                qualifiedName = "html";
+            }
+
+            DocumentType documentType = impl.createDocumentType(qualifiedName, publicId, systemId);
+
+            //
+            // While the qualified name is "HTML" for some DocTypes, we want the actual document root name to be "html". See bug #116
+            //
+            if ("HTML".equals(qualifiedName)) {
+                qualifiedName = "html";
+            }
+            document = impl.createDocument(rootNode.getNamespaceURIOnPath(""), qualifiedName, documentType);
+            document.setXmlVersion(xmlVersion);
+        } else {
+            document = builder.newDocument();
+            document.setXmlVersion(xmlVersion);
+            Element rootElement = document.createElement(rootNode.getName());
+            document.appendChild(rootElement);
+        }
+
+        //
+        // Turn off error checking if we're allowing invalid attribute names, or if we've chosen to turn it off
+        //
+        if (props.isAllowInvalidAttributeNames() || !strictErrorChecking) {
+            document.setStrictErrorChecking(false);
+        }
+
+        //
+        // Copy across root node attributes - see issue 127. Thanks to rasifiel for the patch
+        //
+        Map<String, String> attributes = rootNode.getAttributes();
+        for (Map.Entry<String, String> entry : attributes.entrySet()) {
+            String attrName = entry.getKey();
+            String attrValue = entry.getValue();
+
+            //
+            // Fix any invalid attribute names
+            //
+            if (!props.isAllowInvalidAttributeNames()) {
+                attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
+            }
+
+            if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())) {
+
+                if (escapeXml) {
+                    attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
+                    attrValue = Utils.escapeXml(attrValue, props, true);
+                }
+
+                document.getDocumentElement().setAttribute(attrName, attrValue);
+
+                //
+                // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
+                //
+                if (attrName.equalsIgnoreCase("id")) {
+                    document.getDocumentElement().setIdAttribute(attrName, true);
+                }
+            }
+        }
+
+        return document;
+    }
+
+    /**
+     * @param rootNode the HTML Cleaner root node to serialize
+     * @return the W3C Document object
+     * @throws ParserConfigurationException if there's an error during serialization
+     */
+    public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
+        Document document = createDocument(rootNode);
+        createSubnodes(document, document.getDocumentElement(), rootNode.getAllChildren());
+        return document;
+    }
+
+    /**
+     * @param element the element to check
+     * @return true if the passed element is a script or style element
+     */
+    protected boolean isScriptOrStyle(Element element) {
+        String tagName = element.getNodeName();
+        return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
+    }
+
+    /**
+     * Tells whether an element that is configured for CDATA wrapping is in fact empty, so that
+     * e.g. {@code <script src=..></script>} is not turned into a script element holding an
+     * empty {@code <![CDATA[]]>} section.
+     *
+     * @param element the element to check
+     * @return true if {@code props.isUseCdataFor} accepts the element's node name and the
+     *         element has no child nodes or only null/blank text content
+     */
+    protected boolean dontEscape(Element element) {
+        if (!props.isUseCdataFor(element.getNodeName())) {
+            return false;
+        }
+        if (!element.hasChildNodes()) {
+            return true;
+        }
+        String text = element.getTextContent();
+        return text == null || text.trim().length() == 0;
+    }
+
+    /**
+     * @param cdata the CDATA token
+     * @return the token's content without its start and end tokens
+     */
+    protected String outputCData(CData cdata) {
+        return cdata.getContentWithoutStartAndEndTokens();
+    }
+
+    /**
+     * Deserializes entities in CDATA-wrapped content; may be overridden.
+     *
+     * @param input the raw content
+     * @return the result of {@code Utils.deserializeEntities} for the input
+     */
+    protected String deserializeCdataEntities(String input) {
+        return Utils.deserializeEntities(input, props.isRecognizeUnicodeChars());
+    }
+
+    /**
+     * Serialize a given HTML Cleaner node.
+     *
+     * <p>When the element is configured for CDATA wrapping, its content is collected in a
+     * single CDATA section that is surrounded by comment tokens, giving
+     * {@code /*<![CDATA[*}{@code / ... /*]]>*}{@code /} in the serialized output.</p>
+     *
+     * @param document the W3C Document to use for creating new DOM elements
+     * @param element the W3C element to which we'll add the subnodes to
+     * @param tagChildren the HTML Cleaner nodes to serialize for that node; {@code null} is a no-op
+     */
+    protected void createSubnodes(Document document, Element element, List<? extends BaseToken> tagChildren) {
+        if (tagChildren == null) {
+            return;
+        }
+
+        //
+        // For script and style nodes, check if we're set to use CDATA
+        //
+        CDATASection cdata = null;
+        if (props.isUseCdataFor(element.getTagName())) {
+            cdata = document.createCDATASection("");
+            element.appendChild(document.createTextNode(CSS_COMMENT_START));
+            element.appendChild(cdata);
+        }
+
+        for (Object item : tagChildren) {
+            if (item instanceof CommentNode) {
+                Comment comment = document.createComment(((CommentNode) item).getContent());
+                element.appendChild(comment);
+            } else if (item instanceof ContentNode) {
+                String content = prepareContent((ContentNode) item, element);
+                if (cdata != null) {
+                    cdata.appendData(content);
+                } else {
+                    element.appendChild(document.createTextNode(content));
+                }
+            } else if (item instanceof TagNode) {
+                appendTag(document, element, (TagNode) item);
+            } else if (item instanceof List) {
+                @SuppressWarnings("unchecked") // nested lists hold the same token types as the outer list
+                List<? extends BaseToken> sublist = (List<? extends BaseToken>) item;
+                createSubnodes(document, element, sublist);
+            }
+        }
+
+        if (cdata != null) {
+            closeCdataWrapper(document, element, cdata);
+        }
+    }
+
+    /**
+     * Applies entity deserialization, XML escaping and CDATA unwrapping to a content node.
+     *
+     * @param contentNode the content token
+     * @param element the DOM element that will receive the content
+     * @return the text to store in the DOM
+     */
+    private String prepareContent(ContentNode contentNode, Element element) {
+        String content = contentNode.getContent();
+
+        // Content of CDATA-wrapped elements is never XML-escaped.
+        boolean specialCase = props.isUseCdataFor(element.getTagName());
+
+        if (props.isRecognizeUnicodeChars() && props.isTranslateSpecialEntities()) {
+            content = Utils.deserializeEntities(content, props.isRecognizeUnicodeChars());
+        }
+
+        if ((escapeXml || props.isTranslateSpecialEntities()) && !specialCase) {
+            content = Utils.escapeXml(content, props, true);
+        }
+
+        if (specialCase && contentNode instanceof CData) {
+            //
+            // For CDATA sections we don't want to return the start and
+            // end tokens. See issue #106.
+            //
+            content = ((CData) contentNode).getContentWithoutStartAndEndTokens();
+        }
+
+        if (specialCase && deserializeCdataEntities) {
+            content = this.deserializeCdataEntities(content);
+        }
+
+        return content;
+    }
+
+    /**
+     * Converts a child tag into a DOM element (or into escaped text when its name cannot be
+     * made a valid XML identifier), recurses into its children and appends it to the parent.
+     *
+     * @param document the document used as node factory
+     * @param element the parent DOM element
+     * @param subTagNode the child tag to convert
+     */
+    private void appendTag(Document document, Element element, TagNode subTagNode) {
+        //
+        // XML element names are more strict in their definition
+        // than HTML tag identifiers.
+        // See https://www.w3.org/TR/xml/#NT-Name
+        // vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
+        //
+        String name = Utils.sanitizeXmlIdentifier(subTagNode.getName(), props.getInvalidXmlAttributeNamePrefix());
+
+        //
+        // If the element name is completely invalid, treat it as text
+        //
+        if (name == null) {
+            ContentNode contentNode = new ContentNode(subTagNode.getName() + subTagNode.getText().toString());
+            String content = Utils.escapeXml(contentNode.getContent(), props, true);
+            element.appendChild(document.createTextNode(content));
+            return;
+        }
+
+        Element subelement = document.createElement(name);
+        Map<String, String> attributes = subTagNode.getAttributes();
+        for (Map.Entry<String, String> entry : attributes.entrySet()) {
+            String attrName = entry.getKey();
+            String attrValue = entry.getValue();
+            if (escapeXml) {
+                attrValue = Utils.deserializeEntities(attrValue, true);
+                attrValue = Utils.escapeXml(attrValue, props, true);
+            }
+
+            //
+            // Fix any invalid attribute names by adding a prefix
+            //
+            if (!props.isAllowInvalidAttributeNames()) {
+                attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix());
+            }
+
+            if (attrName != null && (Utils.isValidXmlIdentifier(attrName) || props.isAllowInvalidAttributeNames())) {
+                subelement.setAttribute(attrName, attrValue);
+
+                //
+                // Flag the attribute as an ID attribute if appropriate. Thanks to Chris173
+                //
+                if (attrName.equalsIgnoreCase("id")) {
+                    subelement.setIdAttribute(attrName, true);
+                }
+            }
+        }
+
+        // recursively create subnodes
+        createSubnodes(document, subelement, subTagNode.getAllChildren());
+
+        element.appendChild(subelement);
+    }
+
+    /**
+     * Finishes the comment-guarded CDATA section opened at the start of {@code createSubnodes}:
+     * the data is prefixed with the comment end token, guaranteed to start and end with a line
+     * feed around the content, suffixed with the comment start token, and followed by a
+     * closing comment-end text node on the element.
+     *
+     * @param document the document used as node factory
+     * @param element the element that owns the CDATA section
+     * @param cdata the CDATA section to finish
+     */
+    private void closeCdataWrapper(Document document, Element element, CDATASection cdata) {
+        String data = cdata.getData();
+        if (!data.startsWith(NEW_LINE)) {
+            cdata.setData(CSS_COMMENT_END + NEW_LINE + data);
+        } else {
+            cdata.setData(CSS_COMMENT_END + data);
+        }
+        if (!cdata.getData().endsWith(NEW_LINE)) {
+            cdata.appendData(NEW_LINE);
+        }
+        cdata.appendData(CSS_COMMENT_START);
+        element.appendChild(document.createTextNode(CSS_COMMENT_END));
+    }
+
+}
@@ -0,0 +1,69 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.Writer;
+
+
+/**
+ * <p>Token for a closing HTML tag. It ignores attributes and writes nothing when serialized.</p>
+ */
+public class EndTagToken extends TagToken {
+
+    /**
+     * Creates an end tag token without a name.
+     */
+    public EndTagToken() {
+        super();
+    }
+
+    /**
+     * Creates an end tag token for the given tag name.
+     *
+     * @param name the tag name, passed unchanged to the superclass; may be {@code null}
+     */
+    public EndTagToken(String name) {
+        super(name);
+    }
+
+    /**
+     * Does nothing: attributes on a closing tag are ignored.
+     *
+     * @param attName ignored
+     * @param attValue ignored
+     */
+    @Override
+    void addAttribute(String attName, String attValue) {
+        // intentionally empty
+    }
+
+    /**
+     * Does nothing: this token produces no output of its own.
+     *
+     * @param serializer ignored
+     * @param writer ignored
+     */
+    public void serialize(Serializer serializer, Writer writer) {
+        // intentionally empty
+    }
+
+    /**
+     * @return the literal {@code "endtoken"} followed by the superclass string form
+     */
+    @Override
+    public String toString() {
+        final String base = super.toString();
+        return "endtoken" + base;
+    }
+
+}
@@ -0,0 +1,535 @@
+/*  Copyright (c) 2006-2015, Philokypros Ioulianou
+	All rights reserved.
+
+	Redistribution and use of this software in source and binary forms,
+	with or without modification, are permitted provided that the following
+	conditions are met:
+
+	* Redistributions of source code must retain the above
+	  copyright notice, this list of conditions and the
+	  following disclaimer.
+
+	* Redistributions in binary form must reproduce the above
+	  copyright notice, this list of conditions and the
+	  following disclaimer in the documentation and/or other
+	  materials provided with the distribution.
+
+	* The name of HtmlCleaner may not be used to endorse or promote
+	  products derived from this software without specific prior
+	  written permission.
+
+	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+	POSSIBILITY OF SUCH DAMAGE.
+
+	You can contact Philokypros Ioulianou by sending e-mail to
+	philokypro_s@hotmail.com. Please include the word "HtmlCleaner" in the
+	subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+
+/**
+ * Tag provider that registers a fixed table of {@link TagInfo} definitions,
+ * keyed by lower-case tag name, and serves them through
+ * {@link #getTagInfo(String)}.
+ *
+ * <p>The table is filled once in the constructor by the <code>*Elements</code>
+ * group methods. Those methods are public and non-final, so a subclass that
+ * overrides one of them is called from this constructor before its own fields
+ * are initialised.</p>
+ */
+public class Html4TagProvider implements ITagInfoProvider {
+
+    private static final String STRONG = "strong";
+
+    private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo,"+STRONG+",em,q,b,i,u,tt,sub,sup,big,small,strike,s,font";
+    private static final String CLOSE_BEFORE_TAGS = "p,details,summary,menuitem,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
+
+    /**
+     * Same list as {@link #CLOSE_BEFORE_TAGS} without details, summary and menuitem.
+     * Most block-level definitions below use this shorter list.
+     */
+    private static final String BASIC_CLOSE_BEFORE_TAGS = "p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
+
+    /**
+     * Phrasing tags are those that can make up paragraphs along with text to make Phrasing Content
+     */
+    private static final String PHRASING_TAGS = "a,abbr,area,b,bdi,bdo,br,button,canvas,cite,code,command,data,datalist,del,dfn,em,embed,i,iframe,img,input,ins,kbd,keygen,label,link,map,mark,math,meta,meter,noscript,object,output,progress,q,s,samp,script,select,small,span,strong,sub,sup,svg,template,text,textarea,time,u,var,wbr";
+
+    // singleton instance, used if no other TagInfoProvider is specified.
+    // Declared after the string constants so its constructor never depends on
+    // static initialisation order.
+    public final static Html4TagProvider INSTANCE = new Html4TagProvider();
+
+    private final ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
+
+    /**
+     * Builds the tag table by running every group method once, in a fixed order.
+     * The group methods ignore the value of their argument, so {@code null} is passed.
+     */
+    public Html4TagProvider() {
+        basicElements(null);
+        formattingElements(null);
+        formElements(null);
+        imgElements(null);
+        listElements(null);
+        linkElements(null);
+        tableElements(null);
+        styleElements(null);
+        olderElements(null);
+        scriptElements(null);
+    }
+
+    /**
+     * Applies the shared copy-inside list and the given close-before list to
+     * {@code tagInfo}, then registers it under {@code tagName}.
+     */
+    private void putWithCloseRules(String tagName, TagInfo tagInfo, String closeBeforeTags) {
+        tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+        tagInfo.defineCloseBeforeTags(closeBeforeTags);
+        this.put(tagName, tagInfo);
+    }
+
+    /**
+     * Registers title, h1-h6, p, br, hr and div.
+     *
+     * @param tagInfo ignored; any value passed in is never read
+     */
+    public void basicElements(TagInfo tagInfo){
+        this.put("title", new TagInfo("title",  ContentType.text, BelongsTo.HEAD, false, true, false, CloseTag.required, Display.none));
+
+        // h1..h6 share one definition apart from the name
+        for (String heading : new String[] {"h1", "h2", "h3", "h4", "h5", "h6"}) {
+            putWithCloseRules(heading,
+                    new TagInfo(heading, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                    CLOSE_BEFORE_TAGS);
+        }
+
+        // jericho parser requires <p></p>
+        putWithCloseRules("p",
+                new TagInfo("p", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        this.put("br", new TagInfo("br", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none));
+
+        putWithCloseRules("hr",
+                new TagInfo("hr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        putWithCloseRules("div",
+                new TagInfo("div", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                CLOSE_BEFORE_TAGS);
+    }
+
+
+    /**
+     * Registers the text-formatting tags (abbr, b, i, font, pre, strong, ...).
+     *
+     * @param tagInfo ignored; overwritten before first use
+     */
+    public void formattingElements(TagInfo tagInfo){
+        this.put("abbr", new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("acronym", new TagInfo("acronym", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        putWithCloseRules("address",
+                new TagInfo("address", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        // Each of b, i, u, tt, sub, sup, big, small, strike, blink and s lists
+        // all the others (but not itself) as close-inside-copy-after tags.
+        tagInfo = new TagInfo("b", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("u,i,tt,sub,sup,big,small,strike,blink,s");
+        this.put("b", tagInfo);
+
+        this.put("bdo", new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        putWithCloseRules("blockquote",
+                new TagInfo("blockquote", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        this.put("cite", new TagInfo("cite", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("q", new TagInfo("q", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("code", new TagInfo("code", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("ins", new TagInfo("ins", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any));
+
+        tagInfo = new TagInfo("i", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,tt,sub,sup,big,small,strike,blink,s");
+        this.put("i", tagInfo);
+
+        tagInfo = new TagInfo("u", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,i,tt,sub,sup,big,small,strike,blink,s");
+        this.put("u", tagInfo);
+
+        tagInfo = new TagInfo("tt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,big,small,strike,blink,s");
+        this.put("tt", tagInfo);
+
+        tagInfo = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sup,big,small,strike,blink,s");
+        this.put("sub", tagInfo);
+
+        tagInfo = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,big,small,strike,blink,s");
+        this.put("sup", tagInfo);
+
+        tagInfo = new TagInfo("big", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,small,strike,blink,s");
+        this.put("big", tagInfo);
+
+        tagInfo = new TagInfo("small", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,strike,blink,s");
+        this.put("small", tagInfo);
+
+        tagInfo = new TagInfo("strike", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,blink,s");
+        this.put("strike", tagInfo);
+
+        tagInfo = new TagInfo("blink", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,s");
+        this.put("blink", tagInfo);
+
+        putWithCloseRules("marquee",
+                new TagInfo("marquee", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        tagInfo = new TagInfo("s", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseInsideCopyAfterTags("b,u,i,tt,sub,sup,big,small,strike,blink");
+        this.put("s", tagInfo);
+
+        this.put("font", new TagInfo("font", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.inline));
+
+        this.put("basefont", new TagInfo("basefont", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.none));
+
+        putWithCloseRules("center",
+                new TagInfo("center", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        this.put("del", new TagInfo("del", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any));
+
+        this.put("dfn", new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("kbd", new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        putWithCloseRules("pre",
+                new TagInfo("pre", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        this.put("samp", new TagInfo("samp", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put(STRONG, new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("em", new TagInfo("em", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("var", new TagInfo("var", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("wbr", new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none));
+    }
+
+
+    /**
+     * Registers the form-related tags (form, input, select, option, legend, ...).
+     *
+     * @param tagInfo ignored; overwritten before first use
+     */
+    public void formElements(TagInfo tagInfo){
+        tagInfo = new TagInfo("form", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.block);
+        tagInfo.defineForbiddenTags("form");
+        putWithCloseRules("form", tagInfo, "option,optgroup,textarea,select,fieldset," + BASIC_CLOSE_BEFORE_TAGS);
+
+        tagInfo = new TagInfo("input", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline);
+        tagInfo.defineCloseBeforeTags("select,optgroup,option");
+        this.put("input", tagInfo);
+
+        tagInfo = new TagInfo("textarea", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseBeforeTags("select,optgroup,option");
+        this.put("textarea", tagInfo);
+
+        tagInfo = new TagInfo("select", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
+        tagInfo.defineAllowedChildrenTags("option,optgroup");
+        tagInfo.defineCloseBeforeTags("option,optgroup,select");
+        this.put("select", tagInfo);
+
+        tagInfo = new TagInfo("option",  ContentType.text, BelongsTo.BODY, false, false, true, CloseTag.optional, Display.inline);
+        tagInfo.defineFatalTags("select");
+        tagInfo.defineCloseBeforeTags("option");
+        this.put("option", tagInfo);
+
+        tagInfo = new TagInfo("optgroup", ContentType.all, BelongsTo.BODY, false, false, true, CloseTag.required, Display.inline);
+        tagInfo.defineFatalTags("select");
+        tagInfo.defineAllowedChildrenTags("option");
+        tagInfo.defineCloseBeforeTags("optgroup");
+        this.put("optgroup", tagInfo);
+
+        tagInfo = new TagInfo("button", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        tagInfo.defineCloseBeforeTags("select,optgroup,option");
+        this.put("button", tagInfo);
+
+        this.put("label", new TagInfo("label", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        tagInfo = new TagInfo("legend", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        //
+        // If we include this rule, we get an out-of-memory error. See issue 129.
+        //
+        //tagInfo.defineRequiredEnclosingTags("fieldset");
+        tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+        this.put("legend", tagInfo);
+
+        putWithCloseRules("fieldset",
+                new TagInfo("fieldset", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+    }
+
+
+    /**
+     * Registers the list tags (ul, ol, li, dl, dt, dd, menu, dir).
+     *
+     * @param tagInfo ignored; overwritten before first use
+     */
+    public void listElements(TagInfo tagInfo){
+        putWithCloseRules("ul",
+                new TagInfo("ul", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        putWithCloseRules("ol",
+                new TagInfo("ol", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        putWithCloseRules("li",
+                new TagInfo("li", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block),
+                "li," + BASIC_CLOSE_BEFORE_TAGS);
+
+        putWithCloseRules("dl",
+                new TagInfo("dl", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        tagInfo = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineCloseBeforeTags("dt,dd");
+        this.put("dt", tagInfo);
+
+        tagInfo = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineCloseBeforeTags("dt,dd");
+        this.put("dd", tagInfo);
+
+        putWithCloseRules("menu",
+                new TagInfo("menu", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        putWithCloseRules("dir",
+                new TagInfo("dir", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+    }
+
+
+    /**
+     * Registers link and a.
+     *
+     * @param tagInfo ignored; overwritten before first use
+     */
+    public void linkElements(TagInfo tagInfo){
+        this.put("link", new TagInfo("link", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none));
+
+        tagInfo = new TagInfo("a", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseBeforeTags("a");
+        this.put("a", tagInfo);
+    }
+
+
+    /**
+     * Registers the table tags (table, tr, td, th, tbody, thead, tfoot, col, colgroup, caption).
+     *
+     * @param tagInfo ignored; overwritten before first use
+     */
+    public void tableElements(TagInfo tagInfo){
+        tagInfo = new TagInfo("table", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,colgroup,caption");
+        putWithCloseRules("table", tagInfo, "tr,thead,tbody,tfoot,caption,colgroup,table," + BASIC_CLOSE_BEFORE_TAGS);
+
+        tagInfo = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineRequiredEnclosingTags("tbody");
+        tagInfo.defineAllowedChildrenTags("td,th");
+        tagInfo.defineHigherLevelTags("thead,tfoot");
+        tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup");
+        this.put("tr", tagInfo);
+
+        // jericho parser requires <td></td>
+        tagInfo = new TagInfo("td", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineRequiredEnclosingTags("tr");
+        tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
+        this.put("td", tagInfo);
+
+        tagInfo = new TagInfo("th", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineRequiredEnclosingTags("tr");
+        tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
+        this.put("th", tagInfo);
+
+        // tbody, thead and tfoot share one definition apart from the name
+        for (String section : new String[] {"tbody", "thead", "tfoot"}) {
+            tagInfo = new TagInfo(section, ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+            tagInfo.defineFatalTags("table");
+            tagInfo.defineAllowedChildrenTags("tr,form");
+            tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+            this.put(section, tagInfo);
+        }
+
+        tagInfo = new TagInfo("col", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.block);
+        tagInfo.defineFatalTags("colgroup");
+        this.put("col", tagInfo);
+
+        tagInfo = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineAllowedChildrenTags("col");
+        tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+        this.put("colgroup", tagInfo);
+
+        tagInfo = new TagInfo("caption", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineFatalTags("table");
+        tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+        this.put("caption", tagInfo);
+    }
+
+
+    /**
+     * Registers span, style, bgsound, meta and base.
+     *
+     * @param tagInfo ignored; never read
+     */
+    public void styleElements(TagInfo tagInfo){
+        this.put("span", new TagInfo("span", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("style", new TagInfo("style",  ContentType.text, BelongsTo.HEAD, false, false, false, CloseTag.required, Display.none));
+
+        this.put("bgsound", new TagInfo("bgsound", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none));
+
+        this.put("meta", new TagInfo("meta", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none));
+
+        this.put("base", new TagInfo("base", ContentType.none, BelongsTo.HEAD, false, false, false, CloseTag.forbidden, Display.none));
+    }
+
+
+    /**
+     * Registers script, noscript, applet, object and param.
+     *
+     * @param tagInfo ignored; never read
+     */
+    public void scriptElements(TagInfo tagInfo){
+        this.put("script", new TagInfo("script", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.none));
+
+        this.put("noscript", new TagInfo("noscript", ContentType.all, BelongsTo.HEAD_AND_BODY, false, false, false, CloseTag.required, Display.block));
+
+        this.put("applet", new TagInfo("applet", ContentType.all, BelongsTo.BODY, true, false, false, CloseTag.required, Display.any));
+
+        this.put("object", new TagInfo("object", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any));
+
+        putWithCloseRules("param",
+                new TagInfo("param", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none),
+                BASIC_CLOSE_BEFORE_TAGS);
+    }
+
+
+    /**
+     * Registers img, area and map.
+     *
+     * @param tagInfo ignored; overwritten before first use
+     */
+    public void imgElements(TagInfo tagInfo){
+        this.put("img", new TagInfo("img", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.inline));
+
+        tagInfo = new TagInfo("area", ContentType.none, BelongsTo.BODY, false, false, false, CloseTag.forbidden, Display.none);
+        tagInfo.defineFatalTags("map");
+        tagInfo.defineCloseBeforeTags("area");
+        this.put("area", tagInfo);
+
+        tagInfo = new TagInfo("map", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any);
+        tagInfo.defineCloseBeforeTags("map");
+        this.put("map", tagInfo);
+    }
+
+
+    /**
+     * Registers listing, nobr, xmp, xml, isindex, comment, server and iframe.
+     *
+     * @param tagInfo ignored; overwritten before first use
+     */
+    public void olderElements(TagInfo tagInfo){
+        putWithCloseRules("listing",
+                new TagInfo("listing", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        tagInfo = new TagInfo("nobr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+        tagInfo.defineCloseBeforeTags("nobr");
+        this.put("nobr", tagInfo);
+
+        this.put("xmp", new TagInfo("xmp",  ContentType.text, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline));
+
+        this.put("xml", new TagInfo("xml", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none));
+
+        putWithCloseRules("isindex",
+                new TagInfo("isindex", ContentType.none, BelongsTo.BODY, true, false, false, CloseTag.forbidden, Display.block),
+                BASIC_CLOSE_BEFORE_TAGS);
+
+        this.put("comment", new TagInfo("comment", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none));
+
+        this.put("server", new TagInfo("server", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.none));
+
+        this.put("iframe", new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.any));
+    }
+
+
+    /**
+     * Stores a definition in the tag table, replacing any earlier one with the same key.
+     *
+     * @param tagName key to store under; lookups lower-case their argument, so
+     *                only lower-case keys can be found by {@link #getTagInfo(String)}
+     * @param tagInfo definition to store
+     */
+    protected void put(String tagName, TagInfo tagInfo) {
+        this.tagInfoMap.put(tagName, tagInfo);
+    }
+
+    /**
+     * Looks up the definition for a tag name, ignoring case.
+     *
+     * @param tagName tag name in any case; may be {@code null}
+     * @return the registered definition, or {@code null} when {@code tagName}
+     *         is {@code null} or no definition is registered for it
+     */
+    public TagInfo getTagInfo(String tagName) {
+        if (tagName == null) {
+            // null named tagNode happens when a html fragment is being dealt with
+            return null;
+        }
+        // Locale.ROOT keeps the mapping independent of the JVM default locale
+        // (e.g. under a Turkish locale "TITLE" would otherwise not become "title").
+        return this.tagInfoMap.get(tagName.toLowerCase(java.util.Locale.ROOT));
+    }
+
+}
@@ -0,0 +1,885 @@
+/*  Copyright (c) 2006-2017, Philokypros Ioulianou and the HTMLCleaner team
+	All rights reserved.
+
+	Redistribution and use of this software in source and binary forms,
+	with or without modification, are permitted provided that the following
+	conditions are met:
+
+ * Redistributions of source code must retain the above
+	  copyright notice, this list of conditions and the
+	  following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+	  copyright notice, this list of conditions and the
+	  following disclaimer in the documentation and/or other
+	  materials provided with the distribution.
+
+ * The name of HtmlCleaner may not be used to endorse or promote
+	  products derived from this software without specific prior
+	  written permission.
+
+	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+	POSSIBILITY OF SUCH DAMAGE.
+
+	You can contact Philokypros Ioulianou by sending e-mail to
+	philokypro_s@hotmail.com. Please include the word "HtmlCleaner" in the
+	subject line.
+ */
+
+package org.htmlcleaner;
+
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+
+public class Html5TagProvider implements ITagInfoProvider {
+
+	// Tag name reused by CLOSE_BEFORE_COPY_INSIDE_TAGS and by the "strong" registration in phrasingTags.
+	private static final String STRONG = "strong";
+	// Tag definitions keyed by tag name.
+	private ConcurrentMap<String, TagInfo> tagInfoMap = new ConcurrentHashMap<String, TagInfo>();
+	// singleton instance, used if no other TagInfoProvider is specified
+	public final static Html5TagProvider INSTANCE = new Html5TagProvider();
+	// NOTE(review): the only assignment visible in the constructor is commented out,
+	// so this field appears to stay null - confirm before relying on it.
+	public MathMLTagProvider INSTANCE2;
+
+	// Comma-separated tag list handed to TagInfo.defineCloseBeforeCopyInsideTags
+	// by most block-level definitions in this class.
+	private static final String CLOSE_BEFORE_COPY_INSIDE_TAGS = "bdo," + STRONG
+			+ ",em,q,b,i,sub,sup,small,s";
+	// Comma-separated tag list handed to TagInfo.defineCloseBeforeTags; some
+	// definitions extend it with further names (e.g. the headings add h1-h6).
+	private static final String CLOSE_BEFORE_TAGS = "p,summary,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
+
+	// private static final String CLOSE_BEFORE_TAGS =
+	// "h1,h2,h3,h4,h5,h6,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
+
+	/**
+	 * Phrasing tags are those that can make up paragraphs along with text to
+	 * make Phrasing Content. Generally speaking, phrasing content only allows phrasing content as child tags.
+	 */
+	private static final String PHRASING_TAGS = "a,abbr,area,audio,b,bdi,bdo,br,button,canvas,cite,code,command,datalist,del,dfn,em,i,input,ins,kbd,keygen,label,link,map,mark,meta,meter,noscript,output,progress,p,ruby,samp,s,script,select,small,span,strong,sub,sup,svg,template,textarea,time,u,var,wbr";
+
+	/**
+	 * Most elements that are used in the body of documents and applications are categorized as flow content.
+	 */
+	private static final String FLOW_TAGS = "a,abbr,address,area,article,aside,audio,b,bdi,bdo,blockquote,br,button,canvas,cite,code,data,datalist,del,dfn,div,dl,em,embed,fieldset,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hr,i,iframe,img,input,ins,kbd,keygen,label,main,map,mark,math,meter,nav,noscript,object,ol,output,p,pre,progress,q,ruby,s,samp,script,section,select,small,span,strong,sub,sup,svg,table,template,textarea,time,u,ul,var,video,wbr,text";
+	
+	/**
+	 * HTML5 Media Tags
+	 */
+	private static final String MEDIA_TAGS = "audio,video,object,source";
+	
+	private static final String SCRIPT_SUPPORTING_TAGS = "script,template";
+
+	public Html5TagProvider() {
+		TagInfo tagInfo = null;
+
+		embeddedContentTags(tagInfo);
+		semanticFlowTags(tagInfo);
+		interactiveTags(tagInfo);
+		groupingTags(tagInfo);
+		phrasingTags(tagInfo);
+		mediaTags(tagInfo);
+		editTags(tagInfo);
+		formTags(tagInfo);
+		tableTags(tagInfo);
+		metadataTags(tagInfo);
+		scriptingTags(tagInfo);
+		//INSTANCE2 = new MathMLTagProvider(tagInfo, tagInfoMap);
+	}
+	
+	/**
+	 * Registers the two embedded-content root tags, {@code svg} and {@code math}.
+	 *
+	 * @param tagInfo not read; a fresh definition is created for each tag
+	 */
+	public void embeddedContentTags(TagInfo tagInfo) {
+
+		// SVG
+		final TagInfo svg = new TagInfo("svg", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.block);
+		svg.defineAllowedChildrenTags("animate,animateMotion,animateTransform,discard,set,desc,title,metadata,linearGradient,radialGradient,pattern,circle,ellipse,line,path,polygon,polyline,rect,defs,g,svg,symbol,use,a,audio,canvas,clipPath,filter,foreignObject,iframe,image,marker,mask,script,style,switch,text,video,view");
+		svg.setAssumedNamespace("http://www.w3.org/2000/svg");
+		svg.setAssumedNamespacePrefix("svg");
+		put("svg", svg);
+
+		// MathML
+		final TagInfo math = new TagInfo("math", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.block);
+		math.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		math.defineCloseBeforeTags("math,summary,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+		//tagInfo.defineForbiddenTags("math");
+		//
+		// We'll add this later - right now it causes more problems than it solves
+		// as there are no tag name clashes between MathML and HTML unlike in SVG.
+		//
+		math.setAssumedNamespace("http://www.w3.org/1998/Math/MathML");
+		math.setAssumedNamespacePrefix("mathml");
+		//
+		put("math", math);
+	}
+
+	/**
+	 * The HTML5 semantic flow tags-Sectioning tags (15 total)
+	 * 
+	 */
+	public void semanticFlowTags(TagInfo tagInfo) {
+
+		tagInfo = new TagInfo("section", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("section", tagInfo);
+
+		tagInfo = new TagInfo("nav", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("nav", tagInfo);
+
+		tagInfo = new TagInfo("article", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineForbiddenTags("menu");
+		this.put("article", tagInfo);
+
+		tagInfo = new TagInfo("aside", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineForbiddenTags("menu");
+		tagInfo.defineForbiddenTags("address");
+		this.put("aside", tagInfo);
+
+		tagInfo = new TagInfo("h1", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
+		this.put("h1", tagInfo);
+
+		tagInfo = new TagInfo("h2", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
+		this.put("h2", tagInfo);
+
+		tagInfo = new TagInfo("h3", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
+		this.put("h3", tagInfo);
+
+		tagInfo = new TagInfo("h4", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
+		this.put("h4", tagInfo);
+
+		tagInfo = new TagInfo("h5", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
+		this.put("h5", tagInfo);
+
+		tagInfo = new TagInfo("h6", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS+",h1,h2,h3,h4,h5,h6");
+		this.put("h6", tagInfo);
+
+		tagInfo = new TagInfo("hgroup", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineAllowedChildrenTags("h1,h2,h3,h4,h5,h6");
+		this.put("hgroup", tagInfo);
+
+		// header and footer
+		tagInfo = new TagInfo("header", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineForbiddenTags("menu,header,footer");
+		this.put("header", tagInfo);
+
+		tagInfo = new TagInfo("footer", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineForbiddenTags("menu,header,footer");
+		this.put("footer", tagInfo);
+
+		tagInfo = new TagInfo("main", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("main", tagInfo);
+
+		tagInfo = new TagInfo("address", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineForbiddenTags("address");
+		this.put("address", tagInfo);
+	}
+
+	/**
+	 * The HTML5 interactive tags (6 total): details, summary, command, menu, menuitem and dialog.
+	 */
+	public void interactiveTags(TagInfo tagInfo) {
+		// The parameter is not read; each tag gets its own TagInfo below.
+
+		final TagInfo details = new TagInfo("details", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.block);
+		details.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		details.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		put("details", details);
+
+		final TagInfo summary = new TagInfo("summary", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.block);
+		summary.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		summary.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		summary.defineRequiredEnclosingTags("details");
+		summary.defineForbiddenTags("summary");
+		put("summary", summary);
+
+		final TagInfo command = new TagInfo("command", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.block);
+		command.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		command.defineForbiddenTags("command");
+		command.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		put("command", command);
+
+		final TagInfo menu = new TagInfo("menu", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.block);
+		menu.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		menu.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		menu.defineAllowedChildrenTags("menuitem,li");
+		put("menu", menu);
+
+		final TagInfo menuItem = new TagInfo("menuitem", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.block);
+		menuItem.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		menuItem.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		menuItem.defineRequiredEnclosingTags("menu");
+		put("menuitem", menuItem);
+
+		// dialog is the only tag here registered with Display.any and without
+		// the copy-inside list
+		final TagInfo dialog = new TagInfo("dialog", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.any);
+		dialog.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		put("dialog", dialog);
+
+	}
+
+	/**
+	 * The HTML5 grouping tags (13 total)
+	 */
+
+	public void groupingTags(TagInfo tagInfo) {
+
+		tagInfo = new TagInfo("div", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("div", tagInfo);
+
+		tagInfo = new TagInfo("figure", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("figure", tagInfo);
+
+		tagInfo = new TagInfo("figcaption", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.any);
+		tagInfo.defineRequiredEnclosingTags("figure");
+		this.put("figcaption", tagInfo);
+
+		tagInfo = new TagInfo("p", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags("p,address,summary,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml,time");
+		this.put("p", tagInfo);
+
+		tagInfo = new TagInfo("pre", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("pre", tagInfo);
+
+		tagInfo = new TagInfo("ul", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags("dl,"+CLOSE_BEFORE_TAGS);
+		//
+		// This is not correct, but is how most browsers seem to handle
+		// lists. Strictly, only an LI can be a child of a UL or OL
+		//
+		tagInfo.defineAllowedChildrenTags("li,ul,ol,div");
+		//
+		// Where we do have invalid children, we try to insert a LI to make it valid
+		// rather than move out the content.
+		//
+		tagInfo.setPreferredChildTag("li");
+		this.put("ul", tagInfo);
+
+		tagInfo = new TagInfo("ol", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags("dl,"+CLOSE_BEFORE_TAGS);
+		//
+		// This is not correct, but is how most browsers seem to handle
+		// lists. Strictly, only an LI can be a child of a UL or OL
+		//
+		tagInfo.defineAllowedChildrenTags("li,ul,ol,div");
+		//
+		// Where we do have invalid children, we try to insert a LI to make it valid
+		// rather than move out the content.
+		//
+		tagInfo.setPreferredChildTag("li");
+		this.put("ol", tagInfo);
+
+		tagInfo = new TagInfo("li", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags("li," + CLOSE_BEFORE_TAGS);
+		tagInfo.defineRequiredEnclosingTags("ol,menu,ul");
+		this.put("li", tagInfo);
+
+		tagInfo = new TagInfo("dl", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineAllowedChildrenTags("dt,dd,div,"+SCRIPT_SUPPORTING_TAGS);
+		tagInfo.setPreferredChildTag("div");
+		this.put("dl", tagInfo);
+
+		tagInfo = new TagInfo("dt", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.block);
+		tagInfo.defineCloseBeforeTags("dt,dd");
+		tagInfo.defineAllowedChildrenTags(FLOW_TAGS);
+		tagInfo.defineRequiredEnclosingTags("dl");
+		this.put("dt", tagInfo);
+
+		tagInfo = new TagInfo("dd", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.block);
+		tagInfo.defineCloseBeforeTags("dt,dd");
+		tagInfo.defineAllowedChildrenTags(FLOW_TAGS);
+		tagInfo.defineRequiredEnclosingTags("dl");
+		this.put("dd", tagInfo);
+
+		tagInfo = new TagInfo("hr", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("hr", tagInfo);
+
+		tagInfo = new TagInfo("blockquote", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("blockquote", tagInfo);
+	}
+
+	/**
+	 * HTML5 phrasing tags -- text-level semantics (31 total)
+	 */
+	public void phrasingTags(TagInfo tagInfo) {
+
+		tagInfo = new TagInfo("em", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("em", tagInfo);
+
+		tagInfo = new TagInfo(STRONG, ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put(STRONG, tagInfo);
+
+		tagInfo = new TagInfo("small", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,blink,s");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("small", tagInfo);
+
+		tagInfo = new TagInfo("s", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,sup,small,blink");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("s", tagInfo);
+
+		tagInfo = new TagInfo("a", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseBeforeTags("a");
+		this.put("a", tagInfo);
+
+		tagInfo = new TagInfo("wbr", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.none);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("wbr", tagInfo);
+
+		tagInfo = new TagInfo("mark", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("mark", tagInfo);
+
+		tagInfo = new TagInfo("bdi", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("bdi", tagInfo);
+
+		tagInfo = new TagInfo("time", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("time", tagInfo);
+
+		tagInfo = new TagInfo("data", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("data", tagInfo);
+
+		tagInfo = new TagInfo("cite", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("cite", tagInfo);
+
+		tagInfo = new TagInfo("q", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("q", tagInfo);
+
+		tagInfo = new TagInfo("code", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("code", tagInfo);
+
+		tagInfo = new TagInfo("span", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);		
+		this.put("span", tagInfo);
+
+		tagInfo = new TagInfo("bdo", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("bdo", tagInfo);
+
+		tagInfo = new TagInfo("dfn", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("dfn", tagInfo);
+
+		tagInfo = new TagInfo("kbd", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("kbd", tagInfo);
+
+		tagInfo = new TagInfo("abbr", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("abbr", tagInfo);
+
+		tagInfo = new TagInfo("var", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("var", tagInfo);
+
+		tagInfo = new TagInfo("samp", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("samp", tagInfo);
+
+		tagInfo = new TagInfo("br", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.none);
+		this.put("br", tagInfo);
+
+		tagInfo = new TagInfo("sub", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sup,small,blink,s");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("sub", tagInfo);
+
+		tagInfo = new TagInfo("sup", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseInsideCopyAfterTags("b,u,i,sub,small,blink,s");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("sup", tagInfo);
+
+		tagInfo = new TagInfo("b", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseInsideCopyAfterTags("u,i,sub,sup,small,blink,s");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("b", tagInfo);
+
+		tagInfo = new TagInfo("i", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseInsideCopyAfterTags("b,u,sub,sup,small,blink,s");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("i", tagInfo);
+
+		tagInfo = new TagInfo("u", ContentType.all, BelongsTo.BODY, true,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineCloseInsideCopyAfterTags("b,i,sub,sup,small,blink,s");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("u", tagInfo);
+
+		// ---->Html5 Ruby text (added rb,rtc)
+
+		tagInfo = new TagInfo("ruby", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.inline);
+		tagInfo.defineAllowedChildrenTags("rt,rp,rb,rtc");
+		this.put("ruby", tagInfo);
+
+		tagInfo = new TagInfo("rtc", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.inline);
+		tagInfo.defineRequiredEnclosingTags("ruby");
+		tagInfo.defineAllowedChildrenTags("rt,"+PHRASING_TAGS);
+		this.put("rtc", tagInfo);
+
+		tagInfo = new TagInfo("rb", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.inline);
+		tagInfo.defineRequiredEnclosingTags("ruby");
+		this.put("rb", tagInfo);
+
+		tagInfo = new TagInfo("rt", ContentType.text, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.inline);
+		tagInfo.defineRequiredEnclosingTags("ruby");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("rt", tagInfo);
+
+		tagInfo = new TagInfo("rp", ContentType.text, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.inline);
+		tagInfo.defineRequiredEnclosingTags("ruby");
+		tagInfo.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("rp", tagInfo);
+	}
+
+	/**
+	 * Html5 media-embedded tags (13 tags)
+	 */
+	public void mediaTags(TagInfo tagInfo) {
+		// The incoming value is never read: the parameter is reassigned to a
+		// fresh TagInfo for every tag registered below.
+
+		tagInfo = new TagInfo("img", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.inline);
+		this.put("img", tagInfo);
+
+		tagInfo = new TagInfo("iframe", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.any);
+		this.put("iframe", tagInfo);
+
+		tagInfo = new TagInfo("embed", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.block);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		this.put("embed", tagInfo);
+
+		tagInfo = new TagInfo("object", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.any);
+		this.put("object", tagInfo);
+
+		tagInfo = new TagInfo("param", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.none);
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+		tagInfo.defineRequiredEnclosingTags("object");
+		this.put("param", tagInfo);
+
+		tagInfo = new TagInfo("audio", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.any);
+		tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
+		this.put("audio", tagInfo);
+
+		tagInfo = new TagInfo("picture", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.any);
+		tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
+		this.put("picture", tagInfo);
+
+		tagInfo = new TagInfo("video", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.any);
+		tagInfo.defineCloseInsideCopyAfterTags(MEDIA_TAGS);
+		this.put("video", tagInfo);
+
+		tagInfo = new TagInfo("source", ContentType.none, BelongsTo.BODY,
+				false, false, false, CloseTag.forbidden, Display.any);
+		// <source> is a valid child of <picture> as well as of the media
+		// elements, so "picture" has to count as an acceptable enclosing tag;
+		// it is appended last so the existing entries keep their order.
+		tagInfo.defineRequiredEnclosingTags("audio,video,object,picture");
+		this.put("source", tagInfo);
+
+		tagInfo = new TagInfo("track", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.any);
+		tagInfo.defineRequiredEnclosingTags(MEDIA_TAGS);
+		this.put("track", tagInfo);
+
+		tagInfo = new TagInfo("canvas", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.any);
+		this.put("canvas", tagInfo);
+
+		// Image maps: area lists map as its fatal tag, map allows only area children.
+		tagInfo = new TagInfo("area", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.none);
+		tagInfo.defineFatalTags("map");
+		tagInfo.defineCloseBeforeTags("area");
+		this.put("area", tagInfo);
+
+		tagInfo = new TagInfo("map", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.any);
+		tagInfo.defineCloseBeforeTags("map");
+		tagInfo.defineAllowedChildrenTags("area");
+		this.put("map", tagInfo);
+	}
+
+	/**
+	 * The HTML5 edits tags (2 total)
+	 */
+	public void editTags(TagInfo tagInfo) {
+		tagInfo = new TagInfo("ins", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.any);
+		this.put("ins", tagInfo);
+
+		tagInfo = new TagInfo("del", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.any);
+		this.put("del", tagInfo);
+	}
+
+	/**
+	 * The HTML5 table tags (10 total)
+	 */
+	public void tableTags(TagInfo tagInfo) {
+
+		tagInfo = new TagInfo("table", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineAllowedChildrenTags("tr,tbody,thead,tfoot,col,colgroup,caption");
+		tagInfo.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		tagInfo.defineCloseBeforeTags("tr,thead,tbody,tfoot,caption,colgroup,table,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+		this.put("table", tagInfo);
+
+		tagInfo = new TagInfo("tr", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.block);
+		tagInfo.defineFatalTags("table");
+		tagInfo.defineRequiredEnclosingTags("tbody");
+		tagInfo.defineAllowedChildrenTags("td,th");
+		//
+		// Where we do have invalid children, we try to insert a TD to make it valid
+		// rather than move out the content.
+		//
+		tagInfo.setPreferredChildTag("td");
+		tagInfo.defineHigherLevelTags("thead,tfoot");
+		tagInfo.defineCloseBeforeTags("tr,td,th,caption,colgroup");
+		this.put("tr", tagInfo);
+
+		// jericho parser requires <td></td>
+		tagInfo = new TagInfo("td", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.required, Display.block);
+		tagInfo.defineFatalTags("table");
+		tagInfo.defineRequiredEnclosingTags("tr");
+		tagInfo.defineHigherLevelTags("tr");
+		tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
+		this.put("td", tagInfo);
+
+		tagInfo = new TagInfo("th", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.block);
+		tagInfo.defineFatalTags("table");
+		tagInfo.defineRequiredEnclosingTags("tr");
+		tagInfo.defineCloseBeforeTags("td,th,caption,colgroup");
+		this.put("th", tagInfo);
+
+		tagInfo = new TagInfo("tbody", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.block);
+		tagInfo.defineFatalTags("table");
+		tagInfo.defineAllowedChildrenTags("tr,form");
+		tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+		this.put("tbody", tagInfo);
+
+		tagInfo = new TagInfo("thead", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.block);
+		tagInfo.defineFatalTags("table");
+		tagInfo.defineAllowedChildrenTags("tr,form");
+		tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+		this.put("thead", tagInfo);
+
+		tagInfo = new TagInfo("tfoot", ContentType.all, BelongsTo.BODY, false,
+				false, false, CloseTag.optional, Display.block);
+		tagInfo.defineFatalTags("table");
+		tagInfo.defineAllowedChildrenTags("tr,form");
+		tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+		this.put("tfoot", tagInfo);
+
+		tagInfo = new TagInfo("col", ContentType.none, BelongsTo.BODY, false,
+				false, false, CloseTag.forbidden, Display.block);
+		tagInfo.defineFatalTags("colgroup");
+		this.put("col", tagInfo);
+
+		tagInfo = new TagInfo("colgroup", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.optional, Display.block);
+		tagInfo.defineFatalTags("table");
+		tagInfo.defineAllowedChildrenTags("col");
+		tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+		this.put("colgroup", tagInfo);
+
+		tagInfo = new TagInfo("caption", ContentType.all, BelongsTo.BODY,
+				false, false, false, CloseTag.required, Display.inline);
+		tagInfo.defineFatalTags("table");
+		tagInfo.defineCloseBeforeTags("td,th,tr,tbody,thead,tfoot,caption,colgroup");
+		this.put("caption", tagInfo);
+
+	}
+
+	/**
+	 * Registers the HTML5 form tags (15 in total): meter, form, input,
+	 * textarea, select, option, optgroup, button, label, legend, fieldset,
+	 * progress, datalist, keygen and output.
+	 * 
+	 * @param tagInfo
+	 *            not read by this method; every definition is built from
+	 *            scratch before it is registered
+	 */
+	public void formTags(TagInfo tagInfo) {
+		// Close-before list shared by input, textarea and button.
+		final String selectParts = "select,optgroup,option";
+
+		final TagInfo meter = new TagInfo("meter", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.inline);
+		meter.defineAllowedChildrenTags(PHRASING_TAGS);
+		meter.defineCloseBeforeTags("meter");
+		this.put("meter", meter);
+
+		final TagInfo form = new TagInfo("form", ContentType.all,
+				BelongsTo.BODY, false, false, true, CloseTag.required,
+				Display.block);
+		form.defineForbiddenTags("form");
+		form.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		form.defineCloseBeforeTags("option,optgroup,textarea,select,fieldset,p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+		this.put("form", form);
+
+		final TagInfo input = new TagInfo("input", ContentType.none,
+				BelongsTo.BODY, false, false, false, CloseTag.forbidden,
+				Display.inline);
+		input.defineCloseBeforeTags(selectParts);
+		this.put("input", input);
+
+		final TagInfo textarea = new TagInfo("textarea", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.inline);
+		textarea.defineCloseBeforeTags(selectParts);
+		this.put("textarea", textarea);
+
+		final TagInfo select = new TagInfo("select", ContentType.all,
+				BelongsTo.BODY, false, false, true, CloseTag.required,
+				Display.inline);
+		select.defineAllowedChildrenTags("option,optgroup");
+		select.defineCloseBeforeTags("option,optgroup,select");
+		this.put("select", select);
+
+		final TagInfo option = new TagInfo("option", ContentType.text,
+				BelongsTo.BODY, false, false, true, CloseTag.optional,
+				Display.inline);
+		option.defineFatalTags("select,datalist");
+		option.defineCloseBeforeTags("option");
+		this.put("option", option);
+
+		final TagInfo optgroup = new TagInfo("optgroup", ContentType.all,
+				BelongsTo.BODY, false, false, true, CloseTag.required,
+				Display.inline);
+		optgroup.defineFatalTags("select");
+		optgroup.defineAllowedChildrenTags("option");
+		optgroup.defineCloseBeforeTags("optgroup");
+		this.put("optgroup", optgroup);
+
+		final TagInfo button = new TagInfo("button", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.any);
+		button.defineCloseBeforeTags(selectParts);
+		this.put("button", button);
+
+		// label carries no extra rules, so it is registered directly.
+		this.put("label", new TagInfo("label", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.inline));
+
+		final TagInfo legend = new TagInfo("legend", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.block);
+		legend.defineRequiredEnclosingTags("fieldset");
+		legend.defineAllowedChildrenTags(PHRASING_TAGS);
+		this.put("legend", legend);
+
+		final TagInfo fieldset = new TagInfo("fieldset", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.block);
+		fieldset.defineCloseBeforeCopyInsideTags(CLOSE_BEFORE_COPY_INSIDE_TAGS);
+		fieldset.defineCloseBeforeTags("p,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml");
+		this.put("fieldset", fieldset);
+
+		final TagInfo progress = new TagInfo("progress", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.any);
+		progress.defineAllowedChildrenTags(PHRASING_TAGS);
+		progress.defineCloseBeforeTags("progress");
+		this.put("progress", progress);
+
+		final TagInfo datalist = new TagInfo("datalist", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.any);
+		datalist.defineAllowedChildrenTags("option");
+		datalist.defineCloseBeforeTags("datalist");
+		this.put("datalist", datalist);
+
+		// keygen carries no extra rules either.
+		this.put("keygen", new TagInfo("keygen", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.forbidden,
+				Display.any));
+
+		final TagInfo output = new TagInfo("output", ContentType.all,
+				BelongsTo.BODY, false, false, false, CloseTag.required,
+				Display.any);
+		output.defineCloseBeforeTags("output," + CLOSE_BEFORE_TAGS);
+		this.put("output", output);
+	}
+
+	/**
+	 * Registers the HTML5 document metadata tags: meta, link, title, style
+	 * and base. None of them needs extra rules, so each definition is
+	 * registered as soon as it is constructed.
+	 * 
+	 * @param tagInfo
+	 *            not read by this method
+	 */
+	public void metadataTags(TagInfo tagInfo) {
+
+		// As of HTML5, meta can be used in <body> where it has a @name attribute
+		// TODO add attribute rules
+		this.put("meta", new TagInfo("meta", ContentType.none,
+				BelongsTo.HEAD_AND_BODY, false, false, false,
+				CloseTag.forbidden, Display.none));
+
+		// As of HTML5, link can be used in <body> where it has an @itemprop attribute
+		// TODO add attribute rules
+		this.put("link", new TagInfo("link", ContentType.none,
+				BelongsTo.HEAD_AND_BODY, false, false, false,
+				CloseTag.forbidden, Display.none));
+
+		this.put("title", new TagInfo("title", ContentType.text,
+				BelongsTo.HEAD, false, true, false, CloseTag.required,
+				Display.none));
+
+		// Current specification: style can only be used in <head>
+		this.put("style", new TagInfo("style", ContentType.text,
+				BelongsTo.HEAD, false, false, false, CloseTag.required,
+				Display.none));
+
+		this.put("base", new TagInfo("base", ContentType.none,
+				BelongsTo.HEAD, false, false, false, CloseTag.forbidden,
+				Display.none));
+	}
+
+	/**
+	 * Registers the HTML5 scripting tags: script and noscript. Both are
+	 * declared for head and body with a required close tag; they differ only
+	 * in their display value.
+	 * 
+	 * @param tagInfo
+	 *            not read by this method
+	 */
+	public void scriptingTags(TagInfo tagInfo) {
+		this.put("script", new TagInfo("script", ContentType.all,
+				BelongsTo.HEAD_AND_BODY, false, false, false,
+				CloseTag.required, Display.none));
+
+		this.put("noscript", new TagInfo("noscript", ContentType.all,
+				BelongsTo.HEAD_AND_BODY, false, false, false,
+				CloseTag.required, Display.block));
+	}
+
+	/**
+	 * Stores a tag definition in {@code tagInfoMap} under the given name.
+	 * 
+	 * @param tagName
+	 *            key under which the definition is stored
+	 * @param tagInfo
+	 *            definition to store
+	 */
+	protected void put(String tagName, TagInfo tagInfo) {
+		tagInfoMap.put(tagName, tagInfo);
+	}
+
+	/**
+	 * Looks up the definition registered for a tag name. The name is
+	 * lower-cased before the lookup, so the match is case-insensitive for the
+	 * lower-case names registered by this provider.
+	 * 
+	 * @param tagName
+	 *            name of the tag to look up; may be null
+	 * @return the stored definition, or null when {@code tagName} is null or
+	 *         nothing is stored under the lower-cased name
+	 */
+	public TagInfo getTagInfo(String tagName) {
+		if (tagName == null) {
+			// A null-named tag node happens when an HTML fragment is being
+			// dealt with.
+			return null;
+		}
+		// Locale.ROOT keeps the case mapping independent of the JVM default
+		// locale: under a Turkish locale "TITLE".toLowerCase() yields "tıtle"
+		// (dotless i) and the lookup would miss.
+		return this.tagInfoMap.get(tagName.toLowerCase(java.util.Locale.ROOT));
+	}
+
+}
@@ -0,0 +1,62 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+
+/**
+ * <p>General HtmlCleaner runtime exception.</p>
+ *
+ * <p>Unchecked, so callers are not forced to declare or catch it.</p>
+ */
+public class HtmlCleanerException extends RuntimeException {
+
+    /**
+     * Creates an exception with a generic default message.
+     */
+    public HtmlCleanerException() {
+        this("HtmlCleaner exception occurred!");
+    }
+
+    /**
+     * Creates an exception that wraps another throwable.
+     *
+     * @param cause the underlying failure
+     */
+    public HtmlCleanerException(Throwable cause) {
+        super(cause);
+    }
+
+    /**
+     * Creates an exception with the given message.
+     *
+     * @param message description of the failure
+     */
+    public HtmlCleanerException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates an exception with the given message and cause.
+     *
+     * @param message description of the failure
+     * @param cause the underlying failure
+     */
+    public HtmlCleanerException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+}
@@ -0,0 +1,354 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+	All rights reserved.
+
+	Redistribution and use of this software in source and binary forms,
+	with or without modification, are permitted provided that the following
+	conditions are met:
+
+	* Redistributions of source code must retain the above
+	  copyright notice, this list of conditions and the
+	  following disclaimer.
+
+	* Redistributions in binary form must reproduce the above
+	  copyright notice, this list of conditions and the
+	  following disclaimer in the documentation and/or other
+	  materials provided with the distribution.
+
+	* The name of HtmlCleaner may not be used to endorse or promote
+	  products derived from this software without specific prior
+	  written permission.
+
+	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+	POSSIBILITY OF SUCH DAMAGE.
+
+	You can contact Vladimir Nikic by sending e-mail to
+	nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+	subject line.
+*/
+
+package org.htmlcleaner;
+
+import org.apache.tools.ant.BuildException;
+import org.apache.tools.ant.Task;
+
+import java.net.URL;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.FileOutputStream;
+import java.io.ByteArrayOutputStream;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * <p>Support for ANT: a task that cleans HTML and serializes the result as XML.</p>
+ *
+ * <p>Input is taken from the {@code src} attribute (treated as a URL when it starts
+ * with {@code http://} or {@code https://}, otherwise as a file path) or, when
+ * {@code src} is not set, from the {@code text} attribute / nested text. Output goes
+ * to standard output when {@code dest} is empty, into an Ant property when
+ * {@code dest} has the form {@code property:<name>}, and into a file otherwise.
+ * The remaining attributes are copied onto the cleaner's {@link CleanerProperties}
+ * before cleaning.</p>
+ */
+public class HtmlCleanerForAnt extends Task {
+
+    // Input / output selection.
+    private String text;
+    private String src;
+    private String dest;
+    private String incharset = CleanerProperties.DEFAULT_CHARSET;
+    private String outcharset = CleanerProperties.DEFAULT_CHARSET;
+    private String taginfofile = null;
+    private String outputtype = "simple";
+
+    // Values copied one-to-one onto CleanerProperties in configure().
+    private boolean advancedxmlescape = true;
+    private boolean usecdata = true;
+    private String usecdatafor = "script,style";
+    private boolean specialentities = true;
+    private boolean unicodechars = true;
+    private boolean omitunknowntags = false;
+    private boolean treatunknowntagsascontent = false;
+    private boolean omitdeprtags = false;
+    private boolean treatdeprtagsascontent = false;
+    private boolean omitcomments = false;
+    private boolean omitxmldecl = false;
+    private boolean omitdoctypedecl = true;
+    private boolean omithtmlenvelope = false;
+    private boolean useemptyelementtags = true;
+    private boolean allowmultiwordattributes = true;
+    private boolean allowhtmlinsideattributes = false;
+    private boolean ignoreqe = false;
+    private boolean namespacesaware = true;
+    private String hyphenreplacement = "=";
+    private String prunetags = "";
+    private String booleanatts = CleanerProperties.BOOL_ATT_SELF;
+
+    // Optional XPath expression selecting the node to serialize.
+    private String nodebyxpath = null;
+
+    // Optional pipe-separated list of key[=value] transformation items.
+    private String transform = null;
+
+    private boolean allowInvalidAttributeNames = false;
+    private String invalidAttributeNamePrefix = "";
+
+    /** Sets the HTML to clean; used only when {@code src} is not set. */
+    public void setText(String text) {
+        this.text = text;
+    }
+
+    /** Sets the input location: an http(s) URL or a file path. */
+    public void setSrc(String src) {
+        this.src = src;
+    }
+
+    /** Sets the output target: empty for stdout, {@code property:<name>}, or a file path. */
+    public void setDest(String dest) {
+        this.dest = dest;
+    }
+
+    /** Sets the charset passed to the cleaner when reading from {@code src}. */
+    public void setIncharset(String incharset) {
+        this.incharset = incharset;
+    }
+
+    /** Sets the charset passed to the serializer when writing the result. */
+    public void setOutcharset(String outcharset) {
+        this.outcharset = outcharset;
+    }
+
+    /** Sets the path of a tag-info file; when set, a {@link ConfigFileTagProvider} is used. */
+    public void setTaginfofile(String taginfofile) {
+        this.taginfofile = taginfofile;
+    }
+
+    /** Sets the serializer: "compact", "browser-compact", "pretty"; anything else means simple. */
+    public void setOutputtype(String outputtype) {
+        this.outputtype = outputtype;
+    }
+
+    /** Value for {@code CleanerProperties.setAdvancedXmlEscape}. */
+    public void setAdvancedxmlescape(boolean advancedxmlescape) {
+        this.advancedxmlescape = advancedxmlescape;
+    }
+
+    /** Value for {@code CleanerProperties.setUseCdataForScriptAndStyle}. */
+    public void setUsecdata(boolean usecdata) {
+        this.usecdata = usecdata;
+    }
+
+    /** Value for {@code CleanerProperties.setUseCdataFor}. */
+    public void setUsecdatafor(String usecdatafor) {
+        this.usecdatafor = usecdatafor;
+    }
+
+    /** Value for {@code CleanerProperties.setTranslateSpecialEntities}. */
+    public void setSpecialentities(boolean specialentities) {
+        this.specialentities = specialentities;
+    }
+
+    /** Value for {@code CleanerProperties.setRecognizeUnicodeChars}. */
+    public void setUnicodechars(boolean unicodechars) {
+        this.unicodechars = unicodechars;
+    }
+
+    /** Value for {@code CleanerProperties.setOmitUnknownTags}. */
+    public void setOmitunknowntags(boolean omitunknowntags) {
+        this.omitunknowntags = omitunknowntags;
+    }
+
+    /** Value for {@code CleanerProperties.setTreatUnknownTagsAsContent}. */
+    public void setTreatunknowntagsascontent(boolean treatunknowntagsascontent) {
+        this.treatunknowntagsascontent = treatunknowntagsascontent;
+    }
+
+    /** Value for {@code CleanerProperties.setOmitDeprecatedTags}. */
+    public void setOmitdeprtags(boolean omitdeprtags) {
+        this.omitdeprtags = omitdeprtags;
+    }
+
+    /** Value for {@code CleanerProperties.setTreatDeprecatedTagsAsContent}. */
+    public void setTreatdeprtagsascontent(boolean treatdeprtagsascontent) {
+        this.treatdeprtagsascontent = treatdeprtagsascontent;
+    }
+
+    /** Value for {@code CleanerProperties.setOmitComments}. */
+    public void setOmitcomments(boolean omitcomments) {
+        this.omitcomments = omitcomments;
+    }
+
+    /** Value for {@code CleanerProperties.setOmitXmlDeclaration}. */
+    public void setOmitxmldecl(boolean omitxmldecl) {
+        this.omitxmldecl = omitxmldecl;
+    }
+
+    /** Value for {@code CleanerProperties.setOmitDoctypeDeclaration}. */
+    public void setOmitdoctypedecl(boolean omitdoctypedecl) {
+        this.omitdoctypedecl = omitdoctypedecl;
+    }
+
+    /** Value for {@code CleanerProperties.setOmitHtmlEnvelope}. */
+    public void setOmithtmlenvelope(boolean omithtmlenvelope) {
+        this.omithtmlenvelope = omithtmlenvelope;
+    }
+
+    /** Value for {@code CleanerProperties.setUseEmptyElementTags}. */
+    public void setUseemptyelementtags(boolean useemptyelementtags) {
+        this.useemptyelementtags = useemptyelementtags;
+    }
+
+    /** Value for {@code CleanerProperties.setAllowMultiWordAttributes}. */
+    public void setAllowmultiwordattributes(boolean allowmultiwordattributes) {
+        this.allowmultiwordattributes = allowmultiwordattributes;
+    }
+
+    /** Value for {@code CleanerProperties.setAllowHtmlInsideAttributes}. */
+    public void setAllowhtmlinsideattributes(boolean allowhtmlinsideattributes) {
+        this.allowhtmlinsideattributes = allowhtmlinsideattributes;
+    }
+
+    /** Value for {@code CleanerProperties.setIgnoreQuestAndExclam}. */
+    public void setIgnoreqe(boolean ignoreqe) {
+        this.ignoreqe = ignoreqe;
+    }
+
+    /** Value for {@code CleanerProperties.setNamespacesAware}. */
+    public void setNamespacesaware(boolean namespacesaware) {
+        this.namespacesaware = namespacesaware;
+    }
+
+    /** Value for {@code CleanerProperties.setHyphenReplacementInComment}. */
+    public void setHyphenreplacement(String hyphenreplacement) {
+        this.hyphenreplacement = hyphenreplacement;
+    }
+
+    /** Value for {@code CleanerProperties.setPruneTags}. */
+    public void setPrunetags(String prunetags) {
+        this.prunetags = prunetags;
+    }
+
+    /** Value for {@code CleanerProperties.setBooleanAttributeValues}. */
+    public void setBooleanatts(String booleanatts) {
+        this.booleanatts = booleanatts;
+    }
+
+    /** Sets an XPath expression; the first TagNode it yields is serialized instead of the root. */
+    public void setNodebyxpath(String nodebyxpath) {
+        this.nodebyxpath = nodebyxpath;
+    }
+
+    /** Sets the transformation spec: {@code key1[=value1]|key2[=value2]...}. */
+    public void setTransform(String transform) {
+        this.transform = transform;
+    }
+
+    /** Receives nested element text; replaces any previously set text. */
+    public void addText(String text) {
+        this.text = text;
+    }
+
+    /**
+     * Implementation of Ant task execution: builds and configures the cleaner,
+     * cleans the input, optionally narrows the result with an XPath expression,
+     * and serializes it to the chosen destination.
+     *
+     * @throws BuildException when neither {@code src} nor text is given, or when
+     *         reading, XPath evaluation or writing fails
+     */
+    @Override
+    public void execute() throws BuildException {
+        HtmlCleaner cleaner = createCleaner();
+
+        if (text == null && src == null) {
+            throw new BuildException("Either attribute 'src' or text body containing HTML must be specified!");
+        }
+
+        CleanerProperties props = cleaner.getProperties();
+        configure(props);
+        installTransformations(cleaner);
+
+        try {
+            TagNode node = selectNode(cleanInput(cleaner));
+            writeResult(node, props);
+        } catch (IOException e) {
+            throw new BuildException(e);
+        } catch (XPatherException e) {
+            throw new BuildException(e);
+        }
+    }
+
+    /** Creates the cleaner, backed by the tag-info file when one is configured. */
+    private HtmlCleaner createCleaner() {
+        if (this.taginfofile != null) {
+            return new HtmlCleaner(new ConfigFileTagProvider(new File(this.taginfofile)));
+        }
+        return new HtmlCleaner();
+    }
+
+    /** Copies the task attributes onto the cleaner properties. */
+    private void configure(CleanerProperties props) {
+        props.setAdvancedXmlEscape(this.advancedxmlescape);
+        props.setUseCdataFor(this.usecdatafor);
+        props.setUseCdataForScriptAndStyle(this.usecdata);
+        props.setTranslateSpecialEntities(this.specialentities);
+        props.setRecognizeUnicodeChars(this.unicodechars);
+        props.setOmitUnknownTags(this.omitunknowntags);
+        props.setTreatUnknownTagsAsContent(this.treatunknowntagsascontent);
+        props.setOmitDeprecatedTags(this.omitdeprtags);
+        props.setTreatDeprecatedTagsAsContent(this.treatdeprtagsascontent);
+        props.setOmitComments(this.omitcomments);
+        props.setOmitXmlDeclaration(this.omitxmldecl);
+        props.setOmitDoctypeDeclaration(this.omitdoctypedecl);
+        props.setOmitHtmlEnvelope(this.omithtmlenvelope);
+        props.setUseEmptyElementTags(this.useemptyelementtags);
+        props.setAllowMultiWordAttributes(this.allowmultiwordattributes);
+        props.setAllowHtmlInsideAttributes(this.allowhtmlinsideattributes);
+        props.setIgnoreQuestAndExclam(this.ignoreqe);
+        props.setNamespacesAware(this.namespacesaware);
+        props.setHyphenReplacementInComment(this.hyphenreplacement);
+        props.setPruneTags(this.prunetags);
+        props.setBooleanAttributeValues(this.booleanatts);
+        props.setAllowInvalidAttributeNames(this.allowInvalidAttributeNames);
+        props.setInvalidXmlAttributeNamePrefix(this.invalidAttributeNamePrefix);
+    }
+
+    /**
+     * Parses the "transform" attribute and hands the result to the cleaner.
+     * Expected format: {@code <transkey1>[=<transvalue1>]|<transkey2>[=<transvalue2>]...}
+     * (separator is the pipe character).
+     */
+    private void installTransformations(HtmlCleaner cleaner) {
+        if (Utils.isEmptyString(transform)) {
+            return;
+        }
+
+        // Raw Map kept on purpose: the parameter type of
+        // initCleanerTransformations is not visible from this class.
+        Map transInfos = new TreeMap();
+        for (String item : Utils.tokenize(transform, "|")) {
+            int index = item.indexOf('=');
+            // No '=' (or a leading '='): the whole item is the key, with no value.
+            String key = index <= 0 ? item : item.substring(0, index);
+            String value = index <= 0 ? null : item.substring(index + 1);
+            transInfos.put(key, value);
+        }
+
+        cleaner.initCleanerTransformations(transInfos);
+    }
+
+    /** Cleans the configured input: URL, file, or inline text, in that order of precedence. */
+    private TagNode cleanInput(HtmlCleaner cleaner) throws IOException {
+        if (src != null && (src.startsWith("http://") || src.startsWith("https://"))) {
+            return cleaner.clean(new URL(src), incharset);
+        }
+        if (src != null) {
+            return cleaner.clean(new File(src), incharset);
+        }
+        return cleaner.clean(text);
+    }
+
+    /**
+     * Applies the optional XPath expression: returns the first TagNode in the
+     * evaluation result, or the node unchanged when there is no expression or
+     * no TagNode in the result.
+     */
+    private TagNode selectNode(TagNode node) throws XPatherException {
+        if (nodebyxpath == null) {
+            return node;
+        }
+        for (Object element : node.evaluateXPath(nodebyxpath)) {
+            if (element instanceof TagNode) {
+                return (TagNode) element;
+            }
+        }
+        return node;
+    }
+
+    /**
+     * Serializes the node to stdout, an Ant property or a file, depending on
+     * {@code dest}. A file stream opened here is always closed, including when
+     * serialization fails.
+     */
+    private void writeResult(TagNode node, CleanerProperties props) throws IOException {
+        ByteArrayOutputStream propertyBuffer = null;
+        String antPropertyName = "";
+        OutputStream fileOut = null;
+        OutputStream out;
+
+        if (dest == null || "".equals(dest.trim())) {
+            out = System.out;
+        } else if (dest.startsWith("property:")) {
+            propertyBuffer = new ByteArrayOutputStream();
+            out = propertyBuffer;
+            antPropertyName = dest.substring(dest.indexOf(':') + 1);
+            getProject().log("Setting property " + antPropertyName);
+        } else {
+            fileOut = new FileOutputStream(dest);
+            out = fileOut;
+        }
+
+        try {
+            if ("compact".equals(outputtype)) {
+                new CompactXmlSerializer(props).writeToStream(node, out, outcharset);
+            } else if ("browser-compact".equals(outputtype)) {
+                new BrowserCompactXmlSerializer(props).writeToStream(node, out, outcharset);
+            } else if ("pretty".equals(outputtype)) {
+                new PrettyXmlSerializer(props).writeToStream(node, out, outcharset);
+            } else {
+                new SimpleXmlSerializer(props).writeToStream(node, out, outcharset);
+            }
+
+            if (propertyBuffer != null && antPropertyName.length() > 0) {
+                // Decode with the charset the serializer encoded with, not the
+                // platform default.
+                getProject().setNewProperty(antPropertyName, propertyBuffer.toString(outcharset));
+            }
+        } finally {
+            // Only the file stream is owned by this task; System.out stays open.
+            if (fileOut != null) {
+                fileOut.close();
+            }
+        }
+    }
+
+    /** Returns the value later passed to {@code CleanerProperties.setAllowInvalidAttributeNames}. */
+    public boolean isAllowInvalidAttributeNames() {
+        return allowInvalidAttributeNames;
+    }
+
+    /** Value for {@code CleanerProperties.setAllowInvalidAttributeNames}. */
+    public void setAllowInvalidAttributeNames(boolean allowInvalidAttributeNames) {
+        this.allowInvalidAttributeNames = allowInvalidAttributeNames;
+    }
+
+    /** Returns the value later passed to {@code CleanerProperties.setInvalidXmlAttributeNamePrefix}. */
+    public String getInvalidAttributeNamePrefix() {
+        return invalidAttributeNamePrefix;
+    }
+
+    /** Value for {@code CleanerProperties.setInvalidXmlAttributeNamePrefix}. */
+    public void setInvalidAttributeNamePrefix(String invalidAttributeNamePrefix) {
+        this.invalidAttributeNamePrefix = invalidAttributeNamePrefix;
+    }
+
+}
@@ -0,0 +1,16 @@
+package org.htmlcleaner;
+
+import java.util.List;
+
+/**
+ * A token that is a node of the document tree. On top of {@link BaseToken}
+ * it exposes the node's siblings and a readable and writable parent link.
+ */
+public interface HtmlNode extends BaseToken {
+
+    /**
+     * @return the sibling tokens of this node, as defined by the implementation
+     */
+    List<? extends BaseToken> getSiblings();
+
+    /**
+     * @return the parent tag node of this node
+     */
+    TagNode getParent();
+
+    /**
+     * @param parent the tag node to record as this node's parent
+     */
+    void setParent(TagNode parent);
+
+}
@@ -0,0 +1,141 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+    
+    Redistribution and use of this software in source and binary forms, 
+    with or without modification, are permitted provided that the following 
+    conditions are met:
+    
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+    
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+    
+    * The name of HtmlCleaner may not be used to endorse or promote 
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+    POSSIBILITY OF SUCH DAMAGE.
+    
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * <p>Abstract HTML serializer: holds the open-tag and end-tag writing logic
+ * shared by its descendants.</p>
+ */
+public abstract class HtmlSerializer extends Serializer {
+
+    /**
+     * @param props cleaner properties, handed on to the {@link Serializer} constructor
+     */
+    protected HtmlSerializer(CleanerProperties props) {
+        super(props);
+    }
+
+
+    /**
+     * Decides whether a tag is written in the minimized {@code <name />} form.
+     *
+     * @param tagNode node being serialized
+     * @return true only when the tag provider knows the tag, the node has no
+     *         children, and the tag definition reports it as an empty tag
+     */
+    protected boolean isMinimizedTagSyntax(TagNode tagNode) {
+        final TagInfo info = props.getTagInfoProvider().getTagInfo(tagNode.getName());
+        if (info == null) {
+            return false;
+        }
+        if (tagNode.hasChildren()) {
+            return false;
+        }
+        return info.isEmptyTag();
+    }
+
+    /**
+     * @param tagNode node being serialized
+     * @return the result of {@code isScriptOrStyle} for the node
+     */
+    protected boolean dontEscape(TagNode tagNode) {
+        return isScriptOrStyle(tagNode);
+    }
+
+    /**
+     * @param content text to escape
+     * @return the text escaped by {@code Utils.escapeHtml} with the current properties
+     */
+    protected String escapeText(String content) {
+        return Utils.escapeHtml(content, props);
+    }
+
+    /**
+     * Writes the opening tag of a node: name, attributes and, when namespace
+     * aware, the namespace declarations. Nodes with an empty name produce no
+     * output.
+     *
+     * @param tagNode node whose opening tag is written
+     * @param writer destination
+     * @param newLine whether a line break follows a minimized tag
+     * @throws IOException when the writer fails
+     */
+    protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
+        String name = tagNode.getName();
+        if (Utils.isEmptyString(name)) {
+            return;
+        }
+
+        final boolean nsAware = props.isNamespacesAware();
+
+        // Without namespace awareness a prefixed name is reduced to its local part.
+        if (!nsAware && Utils.getXmlNSPrefix(name) != null) {
+            name = Utils.getXmlName(name);
+        }
+        writer.write("<" + name);
+
+        for (Map.Entry<String, String> attribute : tagNode.getAttributes().entrySet()) {
+            String attName = attribute.getKey();
+            final String attValue =
+                    Utils.deserializeEntities(attribute.getValue(), props.isRecognizeUnicodeChars());
+
+            //
+            // The WHATWG attribute identifier rules were applied during the
+            // tokenize stage, so invalid attribute names are not expected
+            // here; only a null name has to be skipped.
+            //
+            if (attName == null) {
+                continue;
+            }
+
+            if (!nsAware && Utils.getXmlNSPrefix(attName) != null) {
+                attName = Utils.getXmlName(attName);
+            }
+            // A plain "xmlns" attribute is skipped when namespace aware.
+            if (!nsAware || !attName.equalsIgnoreCase("xmlns")) {
+                writer.write(" " + attName + "=\"" + escapeText(attValue) + "\"");
+            }
+        }
+
+        if (nsAware) {
+            final Map<String, String> declarations = tagNode.getNamespaceDeclarations();
+            if (declarations != null) {
+                for (Map.Entry<String, String> declaration : declarations.entrySet()) {
+                    final String prefix = declaration.getKey();
+                    // An empty prefix denotes the default namespace.
+                    final String att = prefix.length() > 0 ? "xmlns" + ":" + prefix : "xmlns";
+                    writer.write(" " + att + "=\"" + escapeText(declaration.getValue()) + "\"");
+                }
+            }
+        }
+
+        if (!isMinimizedTagSyntax(tagNode)) {
+            writer.write(">");
+            return;
+        }
+
+        writer.write(" />");
+        if (newLine) {
+            writer.write("\n");
+        }
+    }
+
+    /**
+     * Writes the closing tag of a node. Nodes with an empty name produce no
+     * output.
+     *
+     * @param tagNode node whose closing tag is written
+     * @param writer destination
+     * @param newLine whether a line break follows the tag
+     * @throws IOException when the writer fails
+     */
+    protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
+        String name = tagNode.getName();
+        if (Utils.isEmptyString(name)) {
+            return;
+        }
+
+        final boolean prefixed = Utils.getXmlNSPrefix(name) != null;
+        if (prefixed && !props.isNamespacesAware()) {
+            name = Utils.getXmlName(name);
+        }
+
+        writer.write("</" + name + ">");
+        if (newLine) {
+            writer.write("\n");
+        }
+    }
+
+}
@@ -0,0 +1,52 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+/**
+ * <p>
+ * Source of {@link TagInfo} instances. An implementation serves as the
+ * collection of tag definitions consulted during cleanup, so supplying a custom
+ * implementation is the way to obtain a desired cleaner behaviour.<br/>
+ * In most cases an implementation will be, or will contain, a kind of Map.
+ * </p>
+ */
+public interface ITagInfoProvider {
+
+    /**
+     * @param tagName name of the tag to look up
+     * @return the definition the provider holds for that tag
+     */
+    TagInfo getTagInfo(String tagName);
+
+}
@@ -0,0 +1,254 @@
+package org.htmlcleaner;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.jdom2.CDATA;
+import org.jdom2.Comment;
+import org.jdom2.DefaultJDOMFactory;
+import org.jdom2.Document;
+import org.jdom2.Element;
+import org.jdom2.Namespace;
+import org.jdom2.Text;
+
+/**
+ * <p>
+ * JDom serializer - creates xml JDom instance out of the TagNode.
+ * </p>
+ */
+public class JDomSerializer {
+	
+    private static final String CSS_COMMENT_START = "/*";
+
+    private static final String CSS_COMMENT_END = "*/";
+        
+    private static final String NEW_LINE = "\n";
+
+    private DefaultJDOMFactory factory;
+
+    protected CleanerProperties props;
+    protected boolean escapeXml = true;
+
+    public JDomSerializer(CleanerProperties props, boolean escapeXml) {
+        this.props = props;
+        this.escapeXml = escapeXml;
+    }
+
+    public JDomSerializer(CleanerProperties props) {
+        this(props, true);
+    }
+
+    public Document createJDom(TagNode rootNode) {
+        this.factory = new DefaultJDOMFactory();
+        
+        //
+        // If there is no actual root node then return nothing
+        //
+        if (rootNode.getName() == null) return null;
+        
+        Element rootElement = createElement(rootNode);
+        Document document = this.factory.document(rootElement);
+
+        setAttributes(rootNode, rootElement);
+
+        createSubnodes(rootElement, rootNode.getAllChildren());
+
+        return document;
+    }
+
+    private Element createElement(TagNode node) {
+        String name = node.getName();
+		//
+		// XML element names are more strict in their definition
+		// than  HTML tag identifiers.
+		// See https://www.w3.org/TR/xml/#NT-Name
+		// vs. https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
+		//
+		name = Utils.sanitizeXmlIdentifier(name);
+		
+        boolean nsAware = props.isNamespacesAware();
+        String prefix = Utils.getXmlNSPrefix(name);
+        Map<String, String> nsDeclarations = node.getNamespaceDeclarations();
+        String nsURI = null;
+        if (prefix != null) {
+            name = Utils.getXmlName(name);
+            if (nsAware) {
+                if (nsDeclarations != null) {
+                    nsURI = nsDeclarations.get(prefix);
+                }
+                if (nsURI == null) {
+                    nsURI = node.getNamespaceURIOnPath(prefix);
+                }
+                if (nsURI == null) {
+                    nsURI = prefix;
+                }
+            }
+        } else {
+            if (nsAware) {
+                if (nsDeclarations != null) {
+                    nsURI = nsDeclarations.get("");
+                }
+                if (nsURI == null) {
+                    nsURI = node.getNamespaceURIOnPath(prefix);
+                }
+            }
+        }
+
+        Element element;
+        if (nsAware && nsURI != null) {
+            Namespace ns = prefix == null ? Namespace.getNamespace(nsURI) : Namespace.getNamespace(prefix, nsURI);
+            element = factory.element(name, ns);
+        } else {
+            element = factory.element(name);
+        }
+
+        if (nsAware) {
+            defineNamespaceDeclarations(node, element);
+        }
+        return element;
+    }
+
+    private void defineNamespaceDeclarations(TagNode node, Element element) {
+        Map<String, String> nsDeclarations = node.getNamespaceDeclarations();
+        if (nsDeclarations != null) {
+            for (Map.Entry<String, String> nsEntry : nsDeclarations.entrySet()) {
+                String nsPrefix = nsEntry.getKey();
+                String nsURI = nsEntry.getValue();
+                Namespace ns = nsPrefix == null || "".equals(nsPrefix) ? Namespace.getNamespace(nsURI) : Namespace
+                        .getNamespace(nsPrefix, nsURI);
+                element.addNamespaceDeclaration(ns);
+            }
+        }
+    }
+
+    private void setAttributes(TagNode node, Element element) {
+    	for (Map.Entry<String, String> entry : node.getAttributes().entrySet()) {
+    		String attrName = entry.getKey();
+    		String attrValue = entry.getValue();
+    		if (escapeXml) {
+    			attrValue = Utils.deserializeEntities(attrValue, props.isRecognizeUnicodeChars());
+    			attrValue = Utils.escapeXml(attrValue, props, true);
+    		}
+
+            //
+            // Fix any invalid attribute names
+            //
+            if (!props.isAllowInvalidAttributeNames()){
+            	attrName = Utils.sanitizeXmlIdentifier(attrName, props.getInvalidXmlAttributeNamePrefix(),"");
+            }
+
+            //
+            // Note that even if we did want to allow invalid attribute names, JDom won't allow it
+            //
+    		if (attrName != null && Utils.isValidXmlIdentifier(attrName)){
+    			String attPrefix = Utils.getXmlNSPrefix(attrName);
+    			Namespace ns = null;
+    			if (attPrefix != null) {
+    				attrName = Utils.getXmlName(attrName);
+    				if (props.isNamespacesAware()) {
+    					String nsURI = node.getNamespaceURIOnPath(attPrefix);
+    					if (nsURI == null) {
+    						nsURI = attPrefix;
+    					}
+    					if (!attPrefix.startsWith("xml")) {
+    						ns = Namespace.getNamespace(attPrefix, nsURI);
+    					}
+    				}
+    			}
+
+    			//
+    			// Don't manually add xmlns attributes as these should be 
+    			// handled automatically by JDOM through the namespace
+    			// mechanism
+    			//
+    			if (!attrName.equals("xmlns")){
+    				if (ns == null) {
+    					element.setAttribute(attrName, attrValue);
+    				} else {
+    					element.setAttribute(attrName, attrValue, ns);
+    				}
+    			}
+    		}
+    	}
+    }
+
+    private void createSubnodes(Element element, List<? extends BaseToken> tagChildren) {
+        if (tagChildren != null) {
+        	
+        	CDATA cdata = null;
+        	//
+        	// For script and style nodes, check if we're set to use CDATA
+        	//
+        	if (props.isUseCdataFor(element.getName())){
+        		cdata = factory.cdata("");
+    			element.addContent(factory.text(CSS_COMMENT_START));
+        		element.addContent(cdata); 
+        	}
+        	
+        	
+            Iterator<? extends BaseToken> it = tagChildren.iterator();
+            while (it.hasNext()) {
+            	
+                Object item = it.next();
+                
+                if (item instanceof CommentNode) {
+                    CommentNode commentNode = (CommentNode) item;
+                    Comment comment = factory.comment(commentNode.getContent().toString());
+                    element.addContent(comment);
+                    
+                } else if (item instanceof ContentNode) {
+                	String nodeName = element.getName();
+                	String content = item.toString();
+                	boolean specialCase = props.isUseCdataFor(nodeName);
+
+                	if (escapeXml && !specialCase) {
+                		content = Utils.escapeXml(content, props, true);
+                	}
+                	if (specialCase && item instanceof CData){
+                		//
+                		// For CDATA sections we don't want to return the start and
+                		// end tokens. See issue #106.
+                		//
+                		content = ((CData)item).getContentWithoutStartAndEndTokens();
+                	}
+                	if (cdata != null){
+                		cdata.append(content);
+                	} else {
+                		Text text = factory.text(content);
+                		element.addContent(text);
+                	}
+
+                } else if (item instanceof TagNode) {
+                    TagNode subTagNode = (TagNode) item;
+                    Element subelement = createElement(subTagNode);
+
+                    setAttributes(subTagNode, subelement);
+
+                    // recursively create subnodes
+                    createSubnodes(subelement, subTagNode.getAllChildren());
+
+                    element.addContent(subelement);
+                } else if (item instanceof List) {
+                    List sublist = (List) item;
+                    createSubnodes(element, sublist);
+                }
+                
+            }
+            if (cdata != null){
+        		if (!cdata.getText().startsWith(NEW_LINE)){
+        			cdata.setText(CSS_COMMENT_END + NEW_LINE + cdata.getText());
+        		} else {
+        			cdata.setText(CSS_COMMENT_END + cdata.getText());
+        		}
+        		if (!cdata.getText().endsWith(NEW_LINE)){
+
+        			cdata.append(NEW_LINE);
+        		}
+            	cdata.append(CSS_COMMENT_START); 
+    			element.addContent(factory.text(CSS_COMMENT_END));
+            }
+        }
+    }
+
+}
@@ -0,0 +1,185 @@
+package org.htmlcleaner;
+
+import java.util.concurrent.ConcurrentMap;
+
+/**It contains the MathML tags to use with Html5 tags
+ * 
+ * @author User
+ *
+ */
+public class MathMLTagProvider {
+    
+    private static final String CLOSE_BEFORE_TAGS = "menclose,mpadded,mphantom,mfenced,mstyle,merror,msqrt,mroot,maligngroup,malignmark,mlabeledtr,ms,mi,mo,mn,mfrac,mtext,mspace,mglyph,p,details,summary,menuitem,address,label,abbr,acronym,dfn,kbd,samp,var,cite,code,param,xml";
+    
+    public MathMLTagProvider(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap) {
+    	presentationMarkup(tagInfo,tagInfoMap);
+    }
+    
+   public void presentationMarkup(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
+	   tokenElements(tagInfo,tagInfoMap);
+	   layoutElements(tagInfo,tagInfoMap);
+	   scriptElements(tagInfo,tagInfoMap);
+	   tableElements(tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("maction", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("maction", tagInfo,tagInfoMap);
+       
+   }
+   
+   
+   public void tokenElements(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
+	   tagInfo = new TagInfo("mi", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mi", tagInfo,tagInfoMap);
+	
+	   tagInfo = new TagInfo("mn", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mn", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mo", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mo", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mtext", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mtext", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mspace", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mspace", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("ms", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("ms", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mglyph", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.optional, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mglyph", tagInfo,tagInfoMap);
+   }
+   
+   
+   public void layoutElements(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
+	   
+	   tagInfo = new TagInfo("mrow", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mrow", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mfrac", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mfrac", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("msqrt", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("msqrt", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mroot", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mroot", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mstyle", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mstyle", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("merror", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("merror", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mpadded", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mpadded", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mphantom", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mphantom", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mfenced", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mfenced", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("menclose", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("menclose", tagInfo,tagInfoMap);
+	   
+   }
+    
+    
+   public void scriptElements(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
+	   tagInfo = new TagInfo("msub", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("msub", tagInfo,tagInfoMap); 
+	   
+	   tagInfo = new TagInfo("msup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.inline);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("msup", tagInfo,tagInfoMap); 
+	   
+	   tagInfo = new TagInfo("msubsup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("msubsup", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("munder", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("munder", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mover", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mover", tagInfo,tagInfoMap); 
+	   
+	   tagInfo = new TagInfo("munderover", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("munderover", tagInfo,tagInfoMap); 
+	   
+	   tagInfo = new TagInfo("mmultiscripts", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("mmultiscripts", tagInfo,tagInfoMap); 
+	   
+   }
+   
+   public void tableElements(TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap){
+	   tagInfo = new TagInfo("mtable", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   tagInfo.defineAllowedChildrenTags("mtr,mtd,mo,mn,mlabeledtr");
+	   this.put("mtable", tagInfo,tagInfoMap); 
+	   
+	   tagInfo = new TagInfo("mlabeledtr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   tagInfo.defineRequiredEnclosingTags("mtable");
+	   tagInfo.defineFatalTags("mtable");
+	   this.put("mlabeledtr", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mtr", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   tagInfo.defineAllowedChildrenTags("mtd,mlabeledtr");
+	   //tagInfo.defineRequiredEnclosingTags("mtable");
+	   this.put("mtr", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("mtd", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   //tagInfo.defineRequiredEnclosingTags("mtr");
+	   //tagInfo.defineFatalTags("mtable");
+	   this.put("mtd", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("maligngroup", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("maligngroup", tagInfo,tagInfoMap);
+	   
+	   tagInfo = new TagInfo("malignmark", ContentType.all, BelongsTo.BODY, false, false, false, CloseTag.required, Display.block);
+	   tagInfo.defineCloseBeforeTags(CLOSE_BEFORE_TAGS);
+	   this.put("malignmark", tagInfo,tagInfoMap);
+	   
+   }
+   
+   
+    protected void put(String tagName, TagInfo tagInfo,ConcurrentMap<String, TagInfo> tagInfoMap) {
+        tagInfoMap.put(tagName, tagInfo);
+    }
+
+    public TagInfo getTagInfo(String tagName,ConcurrentMap<String, TagInfo> tagInfoMap) {
+        if ( tagName == null) {
+            return null;
+        } else {
+            return tagInfoMap.get(tagName);
+        }
+    }
+    
+}
@@ -0,0 +1,26 @@
+package org.htmlcleaner;
+
+
+/**
+ * Nesting State
+ * Pairs the set of open tags with the child breaks that together describe the
+ * current HtmlCleaner cleaning state.
+ * @author scottw
+ */
+class NestingState {
+
+    private final OpenTags openTags;
+    private final ChildBreaks childBreaks;
+
+    /**
+     * @param openTags    open tags of the current state
+     * @param childBreaks child breaks of the current state
+     */
+    public NestingState(OpenTags openTags, ChildBreaks childBreaks) {
+        this.openTags = openTags;
+        this.childBreaks = childBreaks;
+    }
+
+    /**
+     * @return the open tags passed to the constructor
+     */
+    public OpenTags getOpenTags() {
+        return openTags;
+    }
+
+    /**
+     * @return the child breaks passed to the constructor
+     */
+    public ChildBreaks getChildBreaks() {
+        return childBreaks;
+    }
+}
@@ -0,0 +1,133 @@
+package org.htmlcleaner;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Set;
+
+/**
+ * Class that contains information and methods for managing list of open,
+ * but unhandled tags.
+ * <p>
+ * Every backwards scan over the list checks the current thread's interrupted flag;
+ * when it is set, {@code HtmlCleaner.handleInterruption()} is invoked and the scan
+ * stops.
+ * </p>
+ */
+class OpenTags {
+
+    /** Cleaner used for tag-info lookups and for interruption handling. */
+    private final HtmlCleaner htmlCleaner;
+
+    /** Open tags in the order they were added; the most recent one is at the end. */
+    List<TagPos> list = new ArrayList<TagPos>();
+
+    /** Last element of {@link #list}, or {@code null} when nothing has been added or the list is empty. */
+    private TagPos last;
+
+    /** Names of all tags ever passed to {@link #addTag}; never shrinks. */
+    private final Set<String> set = new HashSet<String>();
+
+    /**
+     * @param htmlCleaner cleaner this tag list belongs to
+     */
+    OpenTags(HtmlCleaner htmlCleaner) {
+        this.htmlCleaner = htmlCleaner;
+    }
+
+    /**
+     * @return {@code true} when no tag is currently open
+     */
+    boolean isEmpty() {
+        return list.isEmpty();
+    }
+
+    /**
+     * Appends a newly opened tag and remembers its name as encountered.
+     *
+     * @param tagName         name of the opened tag
+     * @param tagInfo         definition of the tag
+     * @param position        position recorded for the tag
+     * @param cleanTimeValues values of the current clean run, passed on to the new {@link TagPos}
+     */
+    void addTag(String tagName, TagInfo tagInfo, int position, CleanTimeValues cleanTimeValues) {
+        last = new TagPos(position, tagName, tagInfo, cleanTimeValues);
+        list.add(last);
+        set.add(tagName);
+    }
+
+    /**
+     * Removes the most recently opened tag with the given name, if there is one.
+     * The name stays registered as encountered.
+     *
+     * @param tagName name of the tag to remove
+     */
+    void removeTag(String tagName) {
+        ListIterator<TagPos> it = list.listIterator(list.size());
+        while (it.hasPrevious()) {
+            if (Thread.currentThread().isInterrupted()) {
+                this.htmlCleaner.handleInterruption();
+                break;
+            }
+            TagPos currTagPos = it.previous();
+            if (tagName.equals(currTagPos.name)) {
+                it.remove();
+                break;
+            }
+        }
+
+        // Re-derive the cached tail, whether or not something was removed.
+        last = list.isEmpty() ? null : list.get(list.size() - 1);
+    }
+
+    /**
+     * @return the oldest open tag, or {@code null} when none is open
+     */
+    TagPos findFirstTagPos() {
+        return list.isEmpty() ? null : list.get(0);
+    }
+
+    /**
+     * @return the most recently opened tag, or {@code null} when none is open
+     */
+    TagPos getLastTagPos() {
+        return last;
+    }
+
+    /**
+     * Searches from the most recent open tag backwards for a tag with the given name.
+     * The search is abandoned as soon as it meets an open tag that the definition of
+     * {@code tagName} reports as fatal.
+     *
+     * @param tagName         name to look for, may be {@code null}
+     * @param cleanTimeValues values of the current clean run, used for the tag-info lookup
+     * @return the matching open tag, or {@code null} when the name is {@code null},
+     *         no match is found, a fatal tag is met first, or the thread is interrupted
+     */
+    TagPos findTag(String tagName, CleanTimeValues cleanTimeValues) {
+        if (tagName == null) {
+            return null;
+        }
+
+        ListIterator<TagPos> it = list.listIterator(list.size());
+        TagInfo fatalInfo = this.htmlCleaner.getTagInfo(tagName, cleanTimeValues);
+
+        while (it.hasPrevious()) {
+            if (Thread.currentThread().isInterrupted()) {
+                this.htmlCleaner.handleInterruption();
+                return null;
+            }
+            TagPos currTagPos = it.previous();
+            if (tagName.equals(currTagPos.name)) {
+                return currTagPos;
+            }
+            if (fatalInfo != null && fatalInfo.isFatalTag(currTagPos.name)) {
+                // do not search past a fatal tag for this tag
+                return null;
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * @param tagName         name to look for
+     * @param cleanTimeValues values of the current clean run
+     * @return {@code true} when {@link #findTag} finds an open tag with that name
+     */
+    boolean tagExists(String tagName, CleanTimeValues cleanTimeValues) {
+        return findTag(tagName, cleanTimeValues) != null;
+    }
+
+    /**
+     * Walks from the most recent open tag backwards. When a tag without definition,
+     * or one whose definition allows anything, is met and a more recent tag has
+     * already been visited, that more recent tag is returned. If the walk reaches
+     * the start of the list, the oldest open tag is returned.
+     *
+     * @return the selected tag, or {@code null} when no tag is open or the thread is
+     *         interrupted
+     */
+    TagPos findTagToPlaceRubbish() {
+        TagPos result = null;
+        TagPos prev = null;
+
+        if (!isEmpty()) {
+            ListIterator<TagPos> it = list.listIterator(list.size());
+            while (it.hasPrevious()) {
+                if (Thread.currentThread().isInterrupted()) {
+                    this.htmlCleaner.handleInterruption();
+                    return null;
+                }
+                result = it.previous();
+                boolean acceptsAnything = result.info == null || result.info.allowsAnything();
+                if (acceptsAnything && prev != null) {
+                    return prev;
+                }
+                prev = result;
+            }
+        }
+
+        return result;
+    }
+
+    /**
+     * Tells whether a tag with this name has been added at any point. Since
+     * {@link #removeTag} does not update the underlying set, the answer stays
+     * {@code true} after the tag has been removed.
+     *
+     * @param tagName name to check
+     */
+    boolean tagEncountered(String tagName) {
+        return set.contains(tagName);
+    }
+
+    /**
+     * Checks if any of tags specified in the set are already open.
+     * @param tags names to check against the currently open tags
+     * @return {@code true} when at least one open tag has a name contained in {@code tags}
+     */
+    boolean someAlreadyOpen(Set<String> tags) {
+        for (TagPos curr : list) {
+            if (tags.contains(curr.name)) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
@@ -0,0 +1,20 @@
+package org.htmlcleaner;
+
+/**
+ *
+ *
+ */
+public enum OptionalOutput {
+    /**
+     * Never outputed even if supplied in the source.
+     */
+    omit,
+    /**
+     * outputed ONLY if supplied in the source.
+     */
+    preserve,
+    /**
+     * Always outputed, if information is not supplied in the source a default is created.
+     */
+    alwaysOutput;
+}
@@ -0,0 +1,221 @@
+/*  Copyright (c) 2006-2013, HtmlCleaner project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * <p>Pretty HTML serializer - creates resulting HTML with indenting lines.</p>
+ */
+public class PrettyHtmlSerializer extends HtmlSerializer {
+
+    private static final String DEFAULT_INDENTATION_STRING = "\t";
+
+    /** String repeated once per nesting level. */
+    private String indentString = DEFAULT_INDENTATION_STRING;
+
+    /** Cache of indentation strings; the entry at index {@code n} is the indent for level {@code n}. */
+    private List<String> indents = new ArrayList<String>();
+
+    /**
+     * Creates a serializer that indents with a single tab per level.
+     *
+     * @param props cleaner properties handed to the superclass
+     */
+    public PrettyHtmlSerializer(CleanerProperties props) {
+        this(props, DEFAULT_INDENTATION_STRING);
+    }
+
+    /**
+     * @param props        cleaner properties handed to the superclass
+     * @param indentString string used for one level of indentation
+     */
+    public PrettyHtmlSerializer(CleanerProperties props, String indentString) {
+        super(props);
+        this.indentString = indentString;
+    }
+
+    /**
+     * Serializes {@code tagNode} starting at indentation level 0.
+     */
+    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
+        serializePrettyHtml(tagNode, writer, 0, false, true);
+    }
+
+    /**
+     * @param level nesting depth, starting at 0
+     * @return Appropriate indentation for the specified depth.
+     */
+    private synchronized String getIndent(int level) {
+        int size = indents.size();
+        if (size <= level) {
+            // Extend the cache up to the requested level; level 0 is the empty string.
+            String prevIndent = size == 0 ? null : indents.get(size - 1);
+            for (int i = size; i <= level; i++) {
+                String currIndent = prevIndent == null ? "" : prevIndent + indentString;
+                indents.add(currIndent);
+                prevIndent = currIndent;
+            }
+        }
+
+        return indents.get(level);
+    }
+
+    /**
+     * Splits {@code content} into lines, trims each one, drops the empty ones and
+     * emits the rest prefixed with the indent for {@code level} and terminated by
+     * a line feed.
+     */
+    private String getIndentedText(String content, int level) {
+        String indent = getIndent(level);
+        StringBuilder result = new StringBuilder(content.length());
+        StringTokenizer tokenizer = new StringTokenizer(content, "\n\r");
+
+        while (tokenizer.hasMoreTokens()) {
+            String line = tokenizer.nextToken().trim();
+            if (!"".equals(line)) {
+                result.append(indent).append(line).append("\n");
+            }
+        }
+
+        return result.toString();
+    }
+
+    /**
+     * Concatenates the children into one string when all of them are content nodes
+     * free of line breaks.
+     *
+     * @return the concatenated content, or {@code null} as soon as a child is not a
+     *         {@link ContentNode} or contains a line feed or carriage return
+     */
+    private String getSingleLineOfChildren(List<? extends BaseToken> children) {
+        StringBuilder result = new StringBuilder();
+
+        for (Object child : children) {
+            if (!(child instanceof ContentNode)) {
+                return null;
+            }
+
+            // The content is intentionally not trimmed: trimming has the potential
+            // to cause issues with actual content without adding any value.
+            String content = child.toString();
+            if (content.indexOf("\n") >= 0 || content.indexOf("\r") >= 0) {
+                return null;
+            }
+            result.append(content);
+        }
+
+        return result.toString();
+    }
+
+    /**
+     * Writes {@code tagNode} and its subtree with line breaks and indentation.
+     *
+     * @param tagNode               node to serialize
+     * @param writer                destination of the output
+     * @param level                 indentation level of this node
+     * @param isPreserveWhitespaces when {@code true}, no line breaks or indentation are
+     *                              added; it becomes {@code true} for the children of a
+     *                              {@code pre} tag
+     * @param isLastNewLine         whether the output written so far is considered to
+     *                              end with a line break
+     * @throws IOException if writing fails
+     */
+    protected void serializePrettyHtml(TagNode tagNode, Writer writer, int level, boolean isPreserveWhitespaces, boolean isLastNewLine) throws IOException {
+        List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
+        String tagName = tagNode.getName();
+        boolean isHeadlessNode = Utils.isEmptyString(tagName);
+        String indent = isHeadlessNode ? "" : getIndent(level);
+        // A node without a name adds no indentation level of its own.
+        int childLevel = isHeadlessNode ? level : level + 1;
+
+        if (!isPreserveWhitespaces) {
+            if (!isLastNewLine) {
+                writer.write("\n");
+            }
+            writer.write(indent);
+        }
+        serializeOpenTag(tagNode, writer, true);
+
+        boolean preserveWhitespaces = isPreserveWhitespaces || "pre".equalsIgnoreCase(tagName);
+
+        boolean lastWasNewLine = false;
+
+        if (!isMinimizedTagSyntax(tagNode)) {
+            String singleLine = getSingleLineOfChildren(tagChildren);
+            boolean dontEscape = dontEscape(tagNode);
+            if (!preserveWhitespaces && singleLine != null) {
+                // Pure single-line text stays on the same line as the tags.
+                writer.write(dontEscape ? singleLine : escapeText(singleLine));
+            } else {
+                Iterator<? extends BaseToken> childIterator = tagChildren.iterator();
+                while (childIterator.hasNext()) {
+                    Object child = childIterator.next();
+                    if (child instanceof TagNode) {
+                        serializePrettyHtml((TagNode) child, writer, childLevel, preserveWhitespaces, lastWasNewLine);
+                        lastWasNewLine = false;
+                    } else if (child instanceof ContentNode) {
+                        String content = dontEscape ? child.toString() : escapeText(child.toString());
+                        if (content.length() > 0) {
+                            if (dontEscape || preserveWhitespaces) {
+                                writer.write(content);
+                            } else if (Character.isWhitespace(content.charAt(0))) {
+                                if (!lastWasNewLine) {
+                                    writer.write("\n");
+                                }
+                                if (content.trim().length() > 0) {
+                                    writer.write(getIndentedText(Utils.rtrim(content), childLevel));
+                                } else {
+                                    lastWasNewLine = true;
+                                }
+                            } else {
+                                if (content.trim().length() > 0) {
+                                    writer.write(Utils.rtrim(content));
+                                }
+                                if (!childIterator.hasNext()) {
+                                    writer.write("\n");
+                                    lastWasNewLine = true;
+                                }
+                            }
+                        }
+                    } else if (child instanceof CommentNode) {
+                        if (!lastWasNewLine && !preserveWhitespaces) {
+                            writer.write("\n");
+                        }
+                        CommentNode commentNode = (CommentNode) child;
+                        String content = commentNode.getCommentedContent();
+                        writer.write(dontEscape ? content : getIndentedText(content, childLevel));
+                    }
+                }
+            }
+
+            if (singleLine == null && !preserveWhitespaces) {
+                if (!lastWasNewLine) {
+                    writer.write("\n");
+                }
+                writer.write(indent);
+            }
+
+            serializeEndTag(tagNode, writer, false);
+        }
+    }
+
+}
@@ -0,0 +1,217 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.*;
+
+/**
+ * <p>Pretty XML serializer - creates resulting XML with indenting lines.</p>
+ *
+ * <p>Every nesting level is prefixed with one more copy of the indentation
+ * string (a tab unless another string is supplied). An element whose children
+ * are all text without line breaks is written on a single line; for any other
+ * element the children are written one after another, with nested tags, text
+ * and comments indented one level deeper than their parent.</p>
+ */
+public class PrettyXmlSerializer extends XmlSerializer {
+
+    private static final String DEFAULT_INDENTATION_STRING = "\t";
+
+    /** Text added once per nesting level. */
+    private final String indentString;
+
+    /** Cache of indentation prefixes; the element at index i is the prefix for depth i. */
+    private final List<String> indents = new ArrayList<String>();
+
+    /**
+     * Creates a serializer that indents with the default indentation string (a tab).
+     *
+     * @param props cleaner properties handed to the superclass
+     */
+    public PrettyXmlSerializer(CleanerProperties props) {
+        this(props, DEFAULT_INDENTATION_STRING);
+    }
+
+    /**
+     * Creates a serializer that indents with the given string.
+     *
+     * @param props cleaner properties handed to the superclass
+     * @param indentString text added once per nesting level; <code>null</code>
+     *        selects the default (a tab) instead of leaking the text "null"
+     *        into the output
+     */
+    public PrettyXmlSerializer(CleanerProperties props, String indentString) {
+        super(props);
+        this.indentString = indentString != null ? indentString : DEFAULT_INDENTATION_STRING;
+    }
+
+    /**
+     * Serializes the node starting at indentation depth 0.
+     */
+    @Override
+    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
+        serializePrettyXml(tagNode, writer, 0);
+    }
+
+    /**
+     * Returns the cached indentation prefix for a depth, extending the cache
+     * on demand. Depth 0 is the empty string; each further depth appends one
+     * copy of the indentation string.
+     *
+     * @param level nesting depth, expected to be non-negative
+     * @return Appropriate indentation for the specified depth.
+     */
+    private synchronized String getIndent(int level) {
+        int size = indents.size();
+        if (size <= level) {
+            String prevIndent = size == 0 ? null : indents.get(size - 1);
+            for (int i = size; i <= level; i++) {
+                String currIndent = prevIndent == null ? "" : prevIndent + indentString;
+                indents.add(currIndent);
+                prevIndent = currIndent;
+            }
+        }
+
+        return indents.get(level);
+    }
+
+    /**
+     * Re-indents a block of text: the content is split on line breaks, every
+     * line is trimmed, blank lines are dropped, and each remaining line is
+     * emitted as indentation + line + "\n".
+     *
+     * @param content text to re-indent
+     * @param level nesting depth whose indentation is used
+     * @return the re-indented text; empty when the content has no non-blank line
+     */
+    private String getIndentedText(String content, int level) {
+        String indent = getIndent(level);
+        StringBuilder result = new StringBuilder(content.length());
+        StringTokenizer tokenizer = new StringTokenizer(content, "\n\r");
+
+        while (tokenizer.hasMoreTokens()) {
+            String line = tokenizer.nextToken().trim();
+            if (line.length() > 0) {
+                result.append(indent).append(line).append("\n");
+            }
+        }
+
+        return result.toString();
+    }
+
+    /**
+     * Tries to join the children into one line of text.
+     *
+     * @param children children of the element being serialized
+     * @return the concatenated text (first child trimmed on the left, last
+     *         child trimmed on the right), or <code>null</code> when any child
+     *         is not a {@link ContentNode} or any piece still contains a line
+     *         break after trimming
+     */
+    private String getSingleLineOfChildren(List<? extends BaseToken> children) {
+        StringBuilder result = new StringBuilder();
+        Iterator<? extends BaseToken> childrenIt = children.iterator();
+        boolean isFirst = true;
+
+        while (childrenIt.hasNext()) {
+            Object child = childrenIt.next();
+            if (!(child instanceof ContentNode)) {
+                return null;
+            }
+
+            String content = child.toString();
+            // Only the outer edges of the joined text are trimmed; whitespace
+            // between two adjacent text nodes is kept.
+            if (isFirst) {
+                content = ltrim(content);
+            }
+            if (!childrenIt.hasNext()) {
+                content = rtrim(content);
+            }
+
+            if (content.indexOf('\n') >= 0 || content.indexOf('\r') >= 0) {
+                return null;
+            }
+            result.append(content);
+            isFirst = false;
+        }
+
+        return result.toString();
+    }
+
+    /**
+     * Writes the node and, recursively, its children with indentation.
+     *
+     * @param tagNode node to serialize; a node with an empty name is treated
+     *        as "headless": it gets no indentation of its own and its children
+     *        stay on the same depth
+     * @param writer destination
+     * @param level nesting depth of the node
+     * @throws IOException if the writer fails
+     */
+    protected void serializePrettyXml(TagNode tagNode, Writer writer, int level) throws IOException {
+        List<? extends BaseToken> tagChildren = tagNode.getAllChildren();
+        boolean isHeadlessNode = Utils.isEmptyString(tagNode.getName());
+        String indent = isHeadlessNode ? "" : getIndent(level);
+
+        writer.write(indent);
+        serializeOpenTag(tagNode, writer, true);
+
+        // Nothing more to write when the open tag was emitted in minimized form.
+        if (isMinimizedTagSyntax(tagNode)) {
+            return;
+        }
+
+        String singleLine = getSingleLineOfChildren(tagChildren);
+        // Evaluated once and reused for both the single-line and the multi-line path.
+        boolean rawContent = dontEscape(tagNode);
+
+        if (singleLine != null) {
+            writer.write(rawContent ? singleLine.replace("]]>", "]]&gt;") : escapeXml(singleLine));
+        } else {
+            int childLevel = isHeadlessNode ? level : level + 1;
+            if (!isHeadlessNode) {
+                writer.write("\n");
+            }
+            for (Object child : tagChildren) {
+                if (child instanceof TagNode) {
+                    serializePrettyXml((TagNode) child, writer, childLevel);
+                } else if (child instanceof CData) {
+                    serializeCData((CData) child, tagNode, writer);
+                } else if (child instanceof ContentNode) {
+                    String text = child.toString();
+                    String content = rawContent ? text.replace("]]>", "]]&gt;") : escapeXml(text);
+                    writer.write(getIndentedText(content, childLevel));
+                } else if (child instanceof CommentNode) {
+                    String content = ((CommentNode) child).getCommentedContent();
+                    writer.write(getIndentedText(content, childLevel));
+                }
+            }
+            // The closing tag of a multi-line element lines up with its opening tag.
+            writer.write(indent);
+        }
+
+        serializeEndTag(tagNode, writer, true);
+    }
+
+    /**
+     * Trims specified string from left.
+     *
+     * @param s string to trim, may be <code>null</code>
+     * @return the string without leading whitespace, or <code>null</code> for <code>null</code> input
+     */
+    private static String ltrim(String s) {
+        if (s == null) {
+            return null;
+        }
+
+        int index = 0;
+        int len = s.length();
+
+        while (index < len && Character.isWhitespace(s.charAt(index))) {
+            index++;
+        }
+
+        return (index >= len) ? "" : s.substring(index);
+    }
+
+    /**
+     * Trims specified string from right.
+     *
+     * @param s string to trim, may be <code>null</code>
+     * @return the string without trailing whitespace, or <code>null</code> for <code>null</code> input
+     */
+    private static String rtrim(String s) {
+        if (s == null) {
+            return null;
+        }
+
+        int index = s.length();
+
+        while (index > 0 && Character.isWhitespace(s.charAt(index - 1))) {
+            index--;
+        }
+
+        return (index <= 0) ? "" : s.substring(0, index);
+    }
+}
@@ -0,0 +1,49 @@
+package org.htmlcleaner;
+
+/**
+ * A {@link TagNode} that only really holds whitespace or comments - allows
+ * using {@link ContentNode} in places where a {@link TagNode} is expected.
+ * <p/>
+ * This class is currently just a short-lived intermediate artifact generated 
+ * from {@link HtmlCleaner} while cleaning an html file and descarded 
+ * before the results are returned.
+ * 
+ * @author andyhot
+ */
+class ProxyTagNode extends TagNode {
+	private ContentNode token;
+	private CommentNode comment;
+	private TagNode bodyNode;
+	
+	public ProxyTagNode(ContentNode token, TagNode bodyNode) {
+		super("");
+		this.token = token;
+		this.bodyNode = bodyNode;
+	}
+	
+	public ProxyTagNode(CommentNode comment, TagNode bodyNode) {
+		super("");
+		this.comment = comment;
+		this.bodyNode = bodyNode;
+	}	
+
+	@Override
+	public TagNode getParent() {
+		return null;
+	}
+	
+	@Override
+	public boolean removeFromTree() {
+		bodyNode.removeChild(getToken());
+		return true;
+	}	
+	
+	public BaseToken getToken() {
+		return token!=null ? token : comment;
+	}	
+	
+	public String getContent() {
+		return token!=null ? token.getContent() : comment.getContent();
+	}
+
+}
@@ -0,0 +1,273 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+	
+    Redistribution and use of this software in source and binary forms, 
+    with or without modification, are permitted provided that the following 
+    conditions are met:
+	
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+	
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+	
+    * The name of HtmlCleaner may not be used to endorse or promote 
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+    POSSIBILITY OF SUCH DAMAGE.
+	
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * <p>Basic abstract serializer - contains common logic for descendants (the
+ * <code>writeToStream()</code>, <code>writeToFile()</code>, <code>write()</code>
+ * and <code>getAsString()</code> methods).</p>
+ *
+ * <p>Descendants only implement {@link #serialize(TagNode, Writer)}.</p>
+ */
+public abstract class Serializer {
+
+    /**
+     * Used to implement serialization with missing envelope - omitting open and close tags, just
+     * serialize children. It is a nameless node that takes over the attributes, children and
+     * doctype of the wrapped node. Namespace declarations are copied only when both this node
+     * and the wrapped node expose a non-null declaration map.
+     */
+    private static class HeadlessTagNode extends TagNode {
+        private HeadlessTagNode(TagNode wrappedNode) {
+            super("");
+            getAttributes().putAll(wrappedNode.getAttributes());
+            addChildren(wrappedNode.getAllChildren());
+            setDocType(wrappedNode.getDocType());
+            Map<String, String> nsDecls = getNamespaceDeclarations();
+            if (nsDecls != null) {
+                Map<String, String> wrappedNSDecls = wrappedNode.getNamespaceDeclarations();
+                if (wrappedNSDecls != null) {
+                    nsDecls.putAll(wrappedNSDecls);
+                }
+            }
+        }
+    }
+
+    /** Cleaner properties consulted for the default charset and the declaration switches. */
+    protected CleanerProperties props;
+
+    protected Serializer(CleanerProperties props) {
+        this.props = props;
+    }
+
+    /**
+     * Writes specified TagNode to the output stream, using specified charset and optionally omits node envelope
+     * (skips open and close tags of the node). The stream is closed once the node has been written.
+     * @param tagNode Node to be written
+     * @param out Output stream
+     * @param charset Charset of the output
+     * @param omitEnvelope Tells whether to skip open and close tag of the node.
+     * @throws IOException
+     */
+    public void writeToStream(TagNode tagNode, OutputStream out, String charset, boolean omitEnvelope) throws IOException {
+        write(tagNode, new OutputStreamWriter(out, charset), charset, omitEnvelope);
+    }
+
+    /**
+     * Writes specified TagNode to the output stream, using specified charset.
+     * @param tagNode Node to be written
+     * @param out Output stream
+     * @param charset Charset of the output
+     * @throws IOException
+     */
+    public void writeToStream(TagNode tagNode, OutputStream out, String charset) throws IOException {
+        writeToStream(tagNode, out, charset, false);
+    }
+
+    /**
+     * Writes specified TagNode to the output stream, using the charset from the cleaner properties
+     * and optionally omits node envelope (skips open and close tags of the node).
+     * @param tagNode Node to be written
+     * @param out Output stream
+     * @param omitEnvelope Tells whether to skip open and close tag of the node.
+     * @throws IOException
+     */
+    public void writeToStream(TagNode tagNode, OutputStream out, boolean omitEnvelope) throws IOException {
+        writeToStream(tagNode, out, props.getCharset(), omitEnvelope);
+    }
+
+    /**
+     * Writes specified TagNode to the output stream, using the charset from the cleaner properties.
+     * @param tagNode Node to be written
+     * @param out Output stream
+     * @throws IOException
+     */
+    public void writeToStream(TagNode tagNode, OutputStream out) throws IOException {
+        writeToStream(tagNode, out, false);
+    }
+
+    /**
+     * Writes specified TagNode to the file, using specified charset and optionally omits node envelope
+     * (skips open and close tags of the node).
+     * @param tagNode Node to be written
+     * @param fileName Output file name
+     * @param charset Charset of the output
+     * @param omitEnvelope Tells whether to skip open and close tag of the node.
+     * @throws IOException
+     */
+    public void writeToFile(TagNode tagNode, String fileName, String charset, boolean omitEnvelope) throws IOException {
+        OutputStream out = new FileOutputStream(fileName);
+        try {
+            writeToStream(tagNode, out, charset, omitEnvelope);
+        } finally {
+            // The stream is opened here, so it must also be released here when the
+            // charset is unsupported or serialization fails half way. After a
+            // successful write it is already closed and this call has no effect.
+            out.close();
+        }
+    }
+
+    /**
+     * Writes specified TagNode to the file, using specified charset.
+     * @param tagNode Node to be written
+     * @param fileName Output file name
+     * @param charset Charset of the output
+     * @throws IOException
+     */
+    public void writeToFile(TagNode tagNode, String fileName, String charset) throws IOException {
+        writeToFile(tagNode, fileName, charset, false);
+    }
+
+    /**
+     * Writes specified TagNode to the file, using the charset from the cleaner properties and
+     * optionally omits node envelope (skips open and close tags of the node).
+     * @param tagNode Node to be written
+     * @param fileName Output file name
+     * @param omitEnvelope Tells whether to skip open and close tag of the node.
+     * @throws IOException
+     */
+    public void writeToFile(TagNode tagNode, String fileName, boolean omitEnvelope) throws IOException {
+        writeToFile(tagNode, fileName, props.getCharset(), omitEnvelope);
+    }
+
+    /**
+     * Writes specified TagNode to the file, using the charset from the cleaner properties.
+     * @param tagNode Node to be written
+     * @param fileName Output file name
+     * @throws IOException
+     */
+    public void writeToFile(TagNode tagNode, String fileName) throws IOException {
+        writeToFile(tagNode, fileName, false);
+    }
+
+    /**
+     * @param tagNode Node to serialize to string
+     * @param charset Charset of the output - stands in xml declaration part
+     * @param omitEnvelope Tells whether to skip open and close tag of the node.
+     * @return Output as string
+     * @throws HtmlCleanerException if writing to the in-memory buffer fails
+     */
+    public String getAsString(TagNode tagNode, String charset, boolean omitEnvelope) {
+        StringWriter writer = new StringWriter();
+        try {
+            write(tagNode, writer, charset, omitEnvelope);
+        } catch (IOException e) {
+            // not writing to the file system so any io errors should be really rare ( and bad)
+            throw new HtmlCleanerException(e);
+        }
+        return writer.getBuffer().toString();
+    }
+
+    /**
+     * @param tagNode Node to serialize to string
+     * @param charset Charset of the output - stands in xml declaration part
+     * @return Output as string
+     */
+    public String getAsString(TagNode tagNode, String charset) {
+        return getAsString(tagNode, charset, false);
+    }
+
+    /**
+     * Serializes using the charset from the cleaner properties.
+     * @param tagNode Node to serialize to string
+     * @param omitEnvelope Tells whether to skip open and close tag of the node.
+     * @return Output as string
+     */
+    public String getAsString(TagNode tagNode, boolean omitEnvelope) {
+        return getAsString(tagNode, props.getCharset(), omitEnvelope);
+    }
+
+    /**
+     * Serializes using the charset from the cleaner properties.
+     * @param tagNode Node to serialize to string
+     * @return Output as string
+     */
+    public String getAsString(TagNode tagNode) {
+        return getAsString(tagNode, false);
+    }
+
+    /**
+     * Cleans the given markup with a new {@link HtmlCleaner} built from this serializer's
+     * properties and serializes the result using the charset from those properties.
+     * @param htmlContent Markup to clean and serialize
+     * @return Output as string
+     */
+    public String getAsString(String htmlContent) {
+        HtmlCleaner htmlCleaner = new HtmlCleaner(this.props);
+        TagNode tagNode = htmlCleaner.clean(htmlContent);
+        return getAsString(tagNode, props.getCharset());
+    }
+
+
+    /**
+     * Writes specified node using specified writer.
+     * @param tagNode Node to serialize.
+     * @param writer Writer instance
+     * @param charset Charset of the output
+     * @throws IOException
+     */
+    public void write(TagNode tagNode, Writer writer, String charset) throws IOException {
+        write(tagNode, writer, charset, false);
+    }
+
+    /**
+     * Writes specified node using specified writer. Output order is: the xml declaration
+     * (unless the properties omit it), the doctype of the node (unless the properties omit it
+     * or the node has none), then the node itself. The writer is wrapped in a
+     * {@link BufferedWriter} and is flushed and closed after a successful write.
+     * @param tagNode Node to serialize.
+     * @param writer Writer instance
+     * @param charset Charset of the output; when null the xml declaration carries no encoding
+     * @param omitEnvelope Tells whether to skip open and close tag of the node.
+     * @throws IOException
+     */
+    public void write(TagNode tagNode, Writer writer, String charset, boolean omitEnvelope) throws IOException {
+        if (omitEnvelope) {
+            tagNode = new HeadlessTagNode(tagNode);
+        }
+        writer = new BufferedWriter(writer);
+        if (!props.isOmitXmlDeclaration()) {
+            String declaration = "<?xml version=\"1.0\"";
+            if (charset != null) {
+                declaration += " encoding=\"" + charset + "\"";
+            }
+            declaration += "?>";
+            writer.write(declaration + "\n");
+        }
+
+        if (!props.isOmitDoctypeDeclaration()) {
+            DoctypeToken doctypeToken = tagNode.getDocType();
+            if (doctypeToken != null) {
+                doctypeToken.serialize(this, writer);
+            }
+        }
+
+        serialize(tagNode, writer);
+
+        writer.flush();
+        writer.close();
+    }
+
+
+    /**
+     * @param tagNode Node to inspect
+     * @return true when the node is named "script" or "style", ignoring case
+     */
+    protected boolean isScriptOrStyle(TagNode tagNode) {
+        String tagName = tagNode.getName();
+        return "script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName);
+    }
+
+    /**
+     * Writes the node itself; called by {@link #write(TagNode, Writer, String, boolean)} after
+     * the optional xml and doctype declarations.
+     * @param tagNode Node to serialize
+     * @param writer Writer instance
+     * @throws IOException
+     */
+    protected abstract void serialize(TagNode tagNode, Writer writer) throws IOException;
+
+}
@@ -0,0 +1,75 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+	
+    Redistribution and use of this software in source and binary forms, 
+    with or without modification, are permitted provided that the following 
+    conditions are met:
+	
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+	
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+	
+    * The name of HtmlCleaner may not be used to endorse or promote 
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+    POSSIBILITY OF SUCH DAMAGE.
+	
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.*;
+
+/**
+ * <p>Simple HTML serializer - creates resulting HTML without indenting and/or compacting.</p>
+ */
+public class SimpleHtmlSerializer extends HtmlSerializer {
+
+    /** When false, text content is written as is instead of going through escapeText(). */
+    boolean escape = true;
+
+    /**
+     * Creates a serializer that escapes text content.
+     */
+    public SimpleHtmlSerializer(CleanerProperties props) {
+        super(props);
+    }
+
+    /**
+     * Creates a serializer with an explicit escaping switch.
+     *
+     * @param props cleaner properties handed to the superclass
+     * @param escape whether text content is escaped before it is written
+     */
+    public SimpleHtmlSerializer(CleanerProperties props, boolean escape) {
+        super(props);
+        this.escape = escape;
+    }
+
+    /**
+     * Writes the open tag, then every child in order, then the end tag.
+     * Children and end tag are skipped when the tag uses minimized syntax.
+     */
+    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
+        serializeOpenTag(tagNode, writer, false);
+
+        if (isMinimizedTagSyntax(tagNode)) {
+            return;
+        }
+
+        for (Object child : tagNode.getAllChildren()) {
+            if (child instanceof ContentNode) {
+                String text = child.toString();
+                // Text goes out untouched when the tag forbids escaping or escaping is off.
+                if (dontEscape(tagNode) || !escape) {
+                    writer.write(text);
+                } else {
+                    writer.write(escapeText(text));
+                }
+            } else if (child instanceof BaseToken) {
+                BaseToken other = (BaseToken) child;
+                other.serialize(this, writer);
+            }
+        }
+
+        serializeEndTag(tagNode, writer, false);
+    }
+
+}
@@ -0,0 +1,79 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+	
+    Redistribution and use of this software in source and binary forms, 
+    with or without modification, are permitted provided that the following 
+    conditions are met:
+	
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+	
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+	
+    * The name of HtmlCleaner may not be used to endorse or promote 
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+    POSSIBILITY OF SUCH DAMAGE.
+	
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * <p>Simple XML serializer - creates resulting XML without indenting lines.</p>
+ */
+public class SimpleXmlSerializer extends XmlSerializer {
+
+    public SimpleXmlSerializer(CleanerProperties props) {
+        super(props);
+    }
+
+    /**
+     * Writes the open tag, then every non-null child in order, then the end tag.
+     * Children and end tag are skipped when the tag uses minimized syntax.
+     */
+    @Override
+    protected void serialize(TagNode tagNode, Writer writer) throws IOException {
+        serializeOpenTag(tagNode, writer, false);
+
+        List<? extends BaseToken> children = tagNode.getAllChildren();
+        if (isMinimizedTagSyntax(tagNode)) {
+            return;
+        }
+
+        for (Object child : children) {
+            if (child == null) {
+                continue;
+            }
+            // CData is tested before ContentNode so that it gets its own handling.
+            if (child instanceof CData) {
+                serializeCData((CData) child, tagNode, writer);
+            } else if (child instanceof ContentNode) {
+                serializeContentToken((ContentNode) child, tagNode, writer);
+            } else {
+                BaseToken other = (BaseToken) child;
+                other.serialize(this, writer);
+            }
+        }
+
+        serializeEndTag(tagNode, writer, false);
+    }
+
+}
@@ -0,0 +1,495 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * <p>This class contains a map of the special entities used in HTML and
+ * their Unicode code points.</p>
+ *
+ * Created by: Vladimir Nikic<br/>
+ * Date: November, 2006.
+ */
+public class SpecialEntities {
+
+    public static final SpecialEntities INSTANCE = new SpecialEntities(true, true) { // shared table with the Greek and math groups enabled
+        @Override
+        public void put(SpecialEntity specialEntity) { // callers may not extend the shared table; the constructor fills it via the private _put
+            throw new UnsupportedOperationException("cannot add to this instance");
+        }
+    };
+
+    /**
+     * key is the {@link SpecialEntity#getKey()} ( i.e. "quot" )
+     */
+	private Map<String, SpecialEntity> entities = new HashMap<String, SpecialEntity>();
+	/**
+	 * Key is the Integer returned by {@link SpecialEntity#intValue()}
+	 */
+	private Map<Integer, SpecialEntity> entitiesByUnicodeCharcode = new HashMap<Integer, SpecialEntity>();
+	private boolean greek;
+	private boolean math;
+	private int maxEntityLength;
+	public static final char NON_BREAKABLE_SPACE = 160;
+
+	public SpecialEntities(boolean greek, boolean math) { // the flags enable the optional Greek-letter and math-symbol groups
+	    this.greek = greek;
+	    this.math = math;
+	    _put(new SpecialEntity("null",	0, "", true)); // code 0, registered with an explicit empty htmlString
+		_put(new SpecialEntity("nbsp",	NON_BREAKABLE_SPACE, null, true));
+		_put(new SpecialEntity("iexcl",	161, null, true));
+		_put(new SpecialEntity("cent",	162, null, true));
+		_put(new SpecialEntity("pound",	163, null, true));
+		_put(new SpecialEntity("curren",	164, null, true));
+		_put(new SpecialEntity("yen",		165, null, true));
+		_put(new SpecialEntity("brvbar",	166, null, true));
+		_put(new SpecialEntity("sect",	167, null, true));
+		_put(new SpecialEntity("uml",		168, null, true));
+		_put(new SpecialEntity("copy",	169, null, true));
+		_put(new SpecialEntity("ordf",	170, null, true));
+		_put(new SpecialEntity("laquo",	171, null, true));
+		_put(new SpecialEntity("not",		172, null, true));
+		_put(new SpecialEntity("shy",		173, null, true));
+		_put(new SpecialEntity("reg",		174, null, true));
+		_put(new SpecialEntity("macr",	175, null, true));
+		_put(new SpecialEntity("deg",		176, null, true));
+		_put(new SpecialEntity("plusmn",	177, null, true));
+		_put(new SpecialEntity("sup2",	178, null, true));
+		_put(new SpecialEntity("sup3",	179, null, true));
+		_put(new SpecialEntity("acute",	180, null, true));
+		_put(new SpecialEntity("micro",	181, null, true));
+		_put(new SpecialEntity("para",	182, null, true));
+		_put(new SpecialEntity("middot",	183, null, true));
+		_put(new SpecialEntity("cedil",	184, null, true));
+		_put(new SpecialEntity("sup1",	185, null, true));
+		_put(new SpecialEntity("ordm",	186, null, true));
+		_put(new SpecialEntity("raquo",	187, null, true));
+		_put(new SpecialEntity("frac14",	188, null, true));
+		_put(new SpecialEntity("frac12",	189, null, true));
+		_put(new SpecialEntity("frac34",	190, null, true));
+		_put(new SpecialEntity("iquest",	191, null, true));
+		_put(new SpecialEntity("Agrave",	192, null, true));
+		_put(new SpecialEntity("Aacute",	193, null, true));
+		_put(new SpecialEntity("Acirc",	194, null, true));
+		_put(new SpecialEntity("Atilde",	195, null, true));
+
+		_put(new SpecialEntity("Auml",	196, null, true));
+		_put(new SpecialEntity("Aring",	197, null, true));
+		_put(new SpecialEntity("AElig",	198, null, true));
+		_put(new SpecialEntity("Ccedil",	199, null, true));
+		_put(new SpecialEntity("Egrave",	200, null, true));
+		_put(new SpecialEntity("Eacute",	201, null, true));
+		_put(new SpecialEntity("Ecirc",	202, null, true));
+		_put(new SpecialEntity("Euml",	203, null, true));
+		_put(new SpecialEntity("Igrave",	204, null, true));
+		_put(new SpecialEntity("Iacute",	205, null, true));
+		_put(new SpecialEntity("Icirc",	206, null, true));
+		_put(new SpecialEntity("Iuml",	207, null, true));
+		_put(new SpecialEntity("ETH",		208, null, true));
+		_put(new SpecialEntity("Ntilde",	209, null, true));
+		_put(new SpecialEntity("Ograve",	210, null, true));
+		_put(new SpecialEntity("Oacute",	211, null, true));
+		_put(new SpecialEntity("Ocirc",	212, null, true));
+		_put(new SpecialEntity("Otilde",	213, null, true));
+		_put(new SpecialEntity("Ouml",	214, null, true));
+		_put(new SpecialEntity("times",	215, null, true));
+		_put(new SpecialEntity("Oslash",	216, null, true));
+		_put(new SpecialEntity("Ugrave",	217, null, true));
+		_put(new SpecialEntity("Uacute",	218, null, true));
+		_put(new SpecialEntity("Ucirc",	219, null, true));
+		_put(new SpecialEntity("Uuml",	220, null, true));
+		_put(new SpecialEntity("Yacute",	221, null, true));
+		_put(new SpecialEntity("THORN",	222, null, true));
+		_put(new SpecialEntity("szlig",	223, null, true));
+		_put(new SpecialEntity("agrave",	224, null, true));
+		_put(new SpecialEntity("aacute",	225, null, true));
+		_put(new SpecialEntity("acirc",	226, null, true));
+		_put(new SpecialEntity("atilde",	227, null, true));
+		_put(new SpecialEntity("auml",	228, null, true));
+		_put(new SpecialEntity("aring",	229, null, true));
+		_put(new SpecialEntity("aelig",	230, null, true));
+		_put(new SpecialEntity("ccedil",	231, null, true));
+		_put(new SpecialEntity("egrave",	232, null, true));
+		_put(new SpecialEntity("eacute",	233, null, true));
+		_put(new SpecialEntity("ecirc",	234, null, true));
+		_put(new SpecialEntity("euml",	235, null, true));
+		_put(new SpecialEntity("igrave",	236, null, true));
+		_put(new SpecialEntity("iacute",	237, null, true));
+		_put(new SpecialEntity("icirc",	238, null, true));
+		_put(new SpecialEntity("iuml",	239, null, true));
+		_put(new SpecialEntity("eth",		240, null, true));
+		_put(new SpecialEntity("ntilde",	241, null, true));
+		_put(new SpecialEntity("ograve",	242, null, true));
+		_put(new SpecialEntity("oacute",	243, null, true));
+		_put(new SpecialEntity("ocirc",	244, null, true));
+		_put(new SpecialEntity("otilde",	245, null, true));
+		_put(new SpecialEntity("ouml",	246, null, true));
+		_put(new SpecialEntity("divide",	247, null, true));
+		_put(new SpecialEntity("oslash",	248, null, true));
+		_put(new SpecialEntity("ugrave",	249, null, true));
+		_put(new SpecialEntity("uacute",	250, null, true));
+		_put(new SpecialEntity("ucirc",	251, null, true));
+		_put(new SpecialEntity("uuml",	252, null, true));
+		_put(new SpecialEntity("yacute",	253, null, true));
+		_put(new SpecialEntity("thorn",	254, null, true));
+		_put(new SpecialEntity("yuml",	255, null, true));
+
+		_put(new SpecialEntity("OElig",	338, null, true));
+		_put(new SpecialEntity("oelig",	339, null, true));
+		_put(new SpecialEntity("Scaron",	352, null, true));
+		_put(new SpecialEntity("scaron",	353, null, true));
+        _put(new SpecialEntity("Yuml",  376, null, true));
+        _put(new SpecialEntity("fnof",  402, null, true));
+		_put(new SpecialEntity("circ",	710, null, true));
+		_put(new SpecialEntity("tilde",	732, null, true));
+		if ( this.greek ) {
+		    // 913    Alpha   Α   greek capital letter alpha
+		    _put(new SpecialEntity("Alpha", 913, null, true));
+		    // 914 Beta    Β   greek capital letter beta
+		    _put(new SpecialEntity("Beta", 914, null, true));
+		    // 915 Gamma   Γ   greek capital letter gamma
+            _put(new SpecialEntity("Gamma", 915, null, true));
+            // 916 Delta   Δ   greek capital letter delta
+            _put(new SpecialEntity("Delta", 916, null, true));
+            // 917 Epsilon Ε   greek capital letter epsilon
+            _put(new SpecialEntity("Epsilon", 917, null, true));
+            // 918 Zeta    Ζ   greek capital letter zeta
+            _put(new SpecialEntity("Zeta", 918, null, true));
+            // 919 Eta Η   greek capital letter eta
+            _put(new SpecialEntity("Eta", 919, null, true));
+            // 920 Theta   Θ   greek capital letter theta
+            _put(new SpecialEntity("Theta", 920, null, true));
+            // 921 Iota    Ι   greek capital letter iota
+            _put(new SpecialEntity("Iota", 921, null, true));
+            // 922 Kappa   Κ   greek capital letter kappa
+            _put(new SpecialEntity("Kappa", 922, null, true));
+            // 923 Lambda  Λ   greek capital letter lambda
+            _put(new SpecialEntity("Lambda", 923, null, true));
+            // 924 Mu  Μ   greek capital letter mu
+            _put(new SpecialEntity("Mu", 924, null, true));
+            // 925 Nu  Ν   greek capital letter nu
+            _put(new SpecialEntity("Nu", 925, null, true));
+            // 926 Xi  Ξ   greek capital letter xi
+            _put(new SpecialEntity("Xi", 926, null, true));
+            // 927 Omicron Ο   greek capital letter omicron
+            _put(new SpecialEntity("Omicron", 927, null, true));
+            // 928 Pi  Π   greek capital letter pi
+            _put(new SpecialEntity("Pi", 928, null, true));
+            // 929 Rho Ρ   greek capital letter rho
+            _put(new SpecialEntity("Rho", 929, null, true));
+            // there is no Sigmaf, and no U+03A2 character either
+            // 931 Sigma   Σ   greek capital letter sigma
+            _put(new SpecialEntity("Sigma", 931, null, true));
+            // 932 Tau Τ   greek capital letter tau
+            _put(new SpecialEntity("Tau", 932, null, true));
+            // 933 Upsilon Υ   greek capital letter upsilon
+            _put(new SpecialEntity("Upsilon", 933, null, true));
+            // 934 Phi Φ   greek capital letter phi
+            _put(new SpecialEntity("Phi", 934, null, true));
+            // 935 Chi Χ   greek capital letter chi
+            _put(new SpecialEntity("Chi", 935, null, true));
+            // 936 Psi Ψ   greek capital letter psi
+            _put(new SpecialEntity("Psi", 936, null, true));
+            // 937 Omega   Ω   greek capital letter omega
+            _put(new SpecialEntity("Omega", 937, null, true));
+            // 945 alpha   α   greek small letter alpha
+            _put(new SpecialEntity("alpha", 945, null, true));
+            // 946 beta    β   greek small letter beta
+            _put(new SpecialEntity("beta", 946, null, true));
+            // 947 gamma   γ   greek small letter gamma
+            _put(new SpecialEntity("gamma", 947, null, true));
+            // 948 delta   δ   greek small letter delta
+            _put(new SpecialEntity("delta", 948, null, true));
+            // 949 epsilon ε   greek small letter epsilon
+            _put(new SpecialEntity("epsilon", 949, null, true));
+            // 950 zeta    ζ   greek small letter zeta
+            _put(new SpecialEntity("zeta", 950, null, true));
+            // 951 eta η   greek small letter eta
+            _put(new SpecialEntity("eta", 951, null, true));
+            // 952 theta   θ   greek small letter theta
+            _put(new SpecialEntity("theta", 952, null, true));
+            // 953 iota    ι   greek small letter iota
+            _put(new SpecialEntity("iota", 953, null, true));
+            // 954 kappa   κ   greek small letter kappa
+            _put(new SpecialEntity("kappa", 954, null, true));
+            // 955 lambda  λ   greek small letter lambda
+            _put(new SpecialEntity("lambda", 955, null, true));
+            // 956 mu  μ   greek small letter mu
+            _put(new SpecialEntity("mu", 956, null, true));
+            // 957 nu  ν   greek small letter nu
+            _put(new SpecialEntity("nu", 957, null, true));
+            // 958 xi  ξ   greek small letter xi
+            _put(new SpecialEntity("xi", 958, null, true));
+            // 959 omicron ο   greek small letter omicron
+            _put(new SpecialEntity("omicron", 959, null, true));
+            // 960 pi  π   greek small letter pi
+            _put(new SpecialEntity("pi", 960, null, true));
+            // 961 rho ρ   greek small letter rho
+            _put(new SpecialEntity("rho", 961, null, true));
+            // 962 sigmaf  ς   greek small letter final sigma
+            _put(new SpecialEntity("sigmaf", 962, null, true));
+            // 963 sigma   σ   greek small letter sigma
+            _put(new SpecialEntity("sigma", 963, null, true));
+            // 964 tau τ   greek small letter tau
+            _put(new SpecialEntity("tau", 964, null, true));
+            // 965 upsilon υ   greek small letter upsilon
+            _put(new SpecialEntity("upsilon", 965, null, true));
+            // 966 phi φ   greek small letter phi
+            _put(new SpecialEntity("phi", 966, null, true));
+            // 967 chi χ   greek small letter chi
+            _put(new SpecialEntity("chi", 967, null, true));
+            // 968 psi ψ   greek small letter psi
+            _put(new SpecialEntity("psi", 968, null, true));
+            // 969 omega   ω   greek small letter omega
+            _put(new SpecialEntity("omega", 969, null, true));
+            // 977 thetasym    ϑ   greek small letter theta symbol
+            _put(new SpecialEntity("thetasym", 977, null, true));
+            // 978 upsih   ϒ   greek upsilon with hook symbol
+            _put(new SpecialEntity("upsih", 978, null, true));
+            // 982 piv ϖ   greek pi symbol
+            _put(new SpecialEntity("piv", 982, null, true));
+		}
+		_put(new SpecialEntity("ensp",	8194, null, true));
+		_put(new SpecialEntity("emsp",	8195, null, true));
+		_put(new SpecialEntity("thinsp",	8201, null, true));
+		_put(new SpecialEntity("zwnj",	8204, null, true));
+		_put(new SpecialEntity("zwj",		8205, null, true));
+		_put(new SpecialEntity("lrm",		8206, null, true));
+		_put(new SpecialEntity("rlm",		8207, null, true));
+		_put(new SpecialEntity("ndash",	8211, null, true));
+		_put(new SpecialEntity("mdash",	8212, null, true));
+		_put(new SpecialEntity("lsquo",	8216, null, true));
+		_put(new SpecialEntity("rsquo",	8217, null, true));
+		_put(new SpecialEntity("sbquo",	8218, null, true));
+		_put(new SpecialEntity("ldquo",	8220, null, true));
+		_put(new SpecialEntity("rdquo",	8221, null, true));
+		_put(new SpecialEntity("bdquo",	8222, null, true));
+		_put(new SpecialEntity("dagger",	8224, null, true));
+        _put(new SpecialEntity("Dagger",    8225, null, true));
+        _put(new SpecialEntity("bull",    8226, null, true));
+        // horizontal ellipsis (three dots)
+		_put(new SpecialEntity("hellip",	8230, null, true));
+        _put(new SpecialEntity("permil",    8240, null, true));
+        _put(new SpecialEntity("prime",    8242, null, true));
+        _put(new SpecialEntity("Prime",    8243, null, true));
+		_put(new SpecialEntity("lsaquo",	8249, null, true));
+		_put(new SpecialEntity("rsaquo",	8250, null, true));
+        _put(new SpecialEntity("oline",    8254, null, true));
+        _put(new SpecialEntity("frasl",    8260, null, true));
+        _put(new SpecialEntity("euro",  8364, null, true));
+        _put(new SpecialEntity("image",  8465, null, true));
+        _put(new SpecialEntity("weierp",  8472, null, true));
+        _put(new SpecialEntity("real",  8476, null, true));
+        _put(new SpecialEntity("trade", 8482, null, true));
+        _put(new SpecialEntity("alefsym", 8501, null, true));
+        _put(new SpecialEntity("larr", 8592, null, true));
+        _put(new SpecialEntity("uarr", 8593, null, true));
+        _put(new SpecialEntity("rarr", 8594, null, true));
+        _put(new SpecialEntity("darr", 8595, null, true));
+        _put(new SpecialEntity("harr", 8596, null, true));
+        _put(new SpecialEntity("crarr", 8629, null, true));
+        _put(new SpecialEntity("lArr", 8656, null, true));
+        _put(new SpecialEntity("uArr", 8657, null, true));
+        _put(new SpecialEntity("rArr", 8658, null, true));
+        _put(new SpecialEntity("dArr", 8659, null, true));
+        _put(new SpecialEntity("hArr", 8660, null, true));
+        if (this.math) {
+            // 8704 forall  ∀   for all
+            _put(new SpecialEntity("forall", 8704, null, true));
+            //8706    part    ∂   partial differential
+            _put(new SpecialEntity("part", 8706, null, true));
+            //8707    exist   ∃   there exists
+            _put(new SpecialEntity("exist", 8707, null, true));
+            //8709    empty   ∅   empty set = null set = diameter
+            _put(new SpecialEntity("empty", 8709, null, true));
+            //8711    nabla   ∇   nabla = backward difference
+            _put(new SpecialEntity("nabla", 8711, null, true));
+            //8712    isin    ∈   element of
+            _put(new SpecialEntity("isin", 8712, null, true));
+            //8713    notin   ∉   not an element of
+            _put(new SpecialEntity("notin", 8713, null, true));
+            //8715    ni  ∋   contains as member
+            _put(new SpecialEntity("ni", 8715, null, true));
+            //8719    prod    ∏   n-ary product = product sign
+            //prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both
+            _put(new SpecialEntity("prod", 8719, null, true));
+            //8721    sum ∑   n-ary summation
+            //sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both
+            _put(new SpecialEntity("sum", 8721, null, true));
+            //8722    minus   −   minus sign
+            _put(new SpecialEntity("minus", 8722, null, true));
+            //8727    lowast  ∗   asterisk operator
+            _put(new SpecialEntity("lowast", 8727, null, true));
+            //8730    radic   √   square root = radical sign
+            _put(new SpecialEntity("radic", 8730, null, true));
+            //8733    prop    ∝   proportional to
+            _put(new SpecialEntity("prop", 8733, null, true));
+            //8734    infin   ∞   infinity
+            _put(new SpecialEntity("infin", 8734, null, true));
+            //8736    ang ∠   angle
+            _put(new SpecialEntity("ang", 8736, null, true));
+            //8743    and ∧   logical and = wedge
+            _put(new SpecialEntity("and", 8743, null, true));
+            //8744    or  ∨   logical or = vee
+            _put(new SpecialEntity("or", 8744, null, true));
+            //8745    cap ∩   intersection = cap
+            _put(new SpecialEntity("cap", 8745, null, true));
+            //8746    cup ∪   union = cup
+            _put(new SpecialEntity("cup", 8746, null, true));
+            //8747    int ∫   integral
+            _put(new SpecialEntity("int", 8747, null, true));
+            //8756    there4  ∴   therefore
+            _put(new SpecialEntity("there4", 8756, null, true));
+            //8764    sim ∼   tilde operator = varies with = similar to
+            //tilde operator is NOT the same character as the tilde, U+007E, although the same glyph might be used to represent both
+            _put(new SpecialEntity("sim", 8764, null, true));
+            //8773    cong    ≅   approximately equal to
+            _put(new SpecialEntity("cong", 8773, null, true));
+            //8776    asymp   ≈   almost equal to = asymptotic to
+            _put(new SpecialEntity("asymp", 8776, null, true));
+            //8800    ne  ≠   not equal to
+            _put(new SpecialEntity("ne", 8800, null, true));
+            //8801    equiv   ≡   identical to
+            _put(new SpecialEntity("equiv", 8801, null, true));
+            //8804    le  ≤   less-than or equal to
+            _put(new SpecialEntity("le", 8804, null, true));
+            //8805    ge  ≥   greater-than or equal to
+            _put(new SpecialEntity("ge", 8805, null, true));
+            //8834    sub ⊂   subset of
+            _put(new SpecialEntity("sub", 8834, null, true));
+            //8835    sup ⊃   superset of
+            _put(new SpecialEntity("sup", 8835, null, true));
+            //note that nsup, 'not a superset of, U+2285' is not covered by the Symbol font encoding and is not included. Should it be, for symmetry? It is in ISOamsn
+            //8836    nsub    ⊄   not a subset of
+            _put(new SpecialEntity("nsub", 8836, null, true));
+            //8838    sube    ⊆   subset of or equal to
+            _put(new SpecialEntity("sube", 8838, null, true));
+            //8839    supe    ⊇   superset of or equal to
+            _put(new SpecialEntity("supe", 8839, null, true));
+            //8853    oplus   ⊕   circled plus = direct sum
+            _put(new SpecialEntity("oplus", 8853, null, true));
+            //8855    otimes  ⊗   circled times = vector product
+            _put(new SpecialEntity("otimes", 8855, null, true));
+            //8869    perp    ⊥   up tack = orthogonal to = perpendicular
+            _put(new SpecialEntity("perp", 8869, null, true));
+            //8901    sdot    ⋅   dot operator
+            _put(new SpecialEntity("sdot", 8901, null, true));
+            //dot operator is NOT the same character as U+00B7 middle dot
+            //8968    lceil   ⌈   left ceiling = apl upstile
+            _put(new SpecialEntity("lceil", 8968, null, true));
+            //8969    rceil   ⌉   right ceiling
+            _put(new SpecialEntity("rceil", 8969, null, true));
+            //8970    lfloor  ⌊   left floor = apl downstile
+            _put(new SpecialEntity("lfloor", 8970, null, true));
+            //8971    rfloor  ⌋   right floor
+            _put(new SpecialEntity("rfloor", 8971, null, true));
+            //9001    lang    〈   left-pointing angle bracket = bra
+            //lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark'
+            _put(new SpecialEntity("lang", 9001, null, true));
+            //9002    rang    〉   right-pointing angle bracket = ket
+            //rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark'
+            _put(new SpecialEntity("rang", 9002, null, true));
+            //9674    loz ◊   lozenge
+            _put(new SpecialEntity("loz", 9674, null, true));
+            //black here seems to mean filled as opposed to hollow
+            //9824    spades  ♠   black spade suit
+            _put(new SpecialEntity("spades", 9824, null, true));
+            //9827    clubs   ♣   black club suit = shamrock
+            _put(new SpecialEntity("clubs", 9827, null, true));
+            //9829    hearts  ♥   black heart suit = valentine
+            _put(new SpecialEntity("hearts", 9829, null, true));
+            //9830    diams   ♦   black diamond suit
+            _put(new SpecialEntity("diams", 9830, null, true));
+        }
+        _put(new SpecialEntity("amp",  '&', null, false)); // from here on the entries pass htmlSpecialEntity=false
+        _put(new SpecialEntity("lt", '<', null, false));
+        _put(new SpecialEntity("gt",  '>', null, false));
+        _put(new SpecialEntity("quot",  '"', null, false));
+        // this is xml only -- apos appearing in html needs to be converted to ' or maybe &#39; to be universally safe
+        // may need to special case for html attributes that use ' as surrounding delimiter on attribute value (instead of " ) : <a href='javascript:foo("bar'")' >weird link</a>
+        _put(new SpecialEntity("apos",  '\'', "'", false));
+	}
+
+	/**
+	 * Looks up an entity by its name.
+	 *
+	 * @param seq entity name; a leading '&' is skipped and everything from the first ';' onward is ignored
+	 * @return the matching {@link SpecialEntity}, or null when seq is empty or the name is not registered
+	 */
+	public SpecialEntity getSpecialEntity(String seq) {
+		if (seq.length() == 0) {
+			return null;
+		}
+		int from = 0;
+		if (seq.charAt(0) == '&') {
+			from = 1;
+		}
+		int to = seq.indexOf(';');
+		String name = to < 0 ? seq.substring(from) : seq.substring(from, to);
+		return entities.get(name);
+	}
+
+	public SpecialEntity getSpecialEntityByUnicode(int unicodeCharcode) { // null when no entity is registered for this code
+	    return this.entitiesByUnicodeCharcode.get(unicodeCharcode);
+	}
+
+	public void put(SpecialEntity specialEntity) { // public, overridable way to add an entity; INSTANCE overrides it to refuse
+	    _put(specialEntity);
+	}
+
+	/**
+     * Registers the entity under its key and its code; a duplicate of either is rejected before any map is modified.
+     */
+    private void _put(SpecialEntity specialEntity) {
+        SpecialEntity old = entities.get(specialEntity.getKey());
+        if ( old == null ) {
+            old = entitiesByUnicodeCharcode.get(specialEntity.intValue());
+        }
+        if ( old != null ) {
+            throw new HtmlCleanerException("replaced "+old+" with "+specialEntity);
+        }
+        entities.put(specialEntity.getKey(), specialEntity);
+        entitiesByUnicodeCharcode.put(specialEntity.intValue(), specialEntity);
+        this.maxEntityLength = Math.max(this.maxEntityLength,specialEntity.getKey().length());
+    }
+    public int getMaxEntityLength() { // length of the longest entity key registered so far
+        return maxEntityLength;
+    }
+}
@@ -0,0 +1,135 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+public class SpecialEntity{
+    private final String key;
+    private final int intCode;
+    // escaped value output when generating html
+    private final String htmlString;
+    private final boolean htmlSpecialEntity;
+    // escaped value output when generating xml
+    private final String escapedXmlString;
+
+    /**
+     * Creates an entity; a code above U+FFFF is kept whole (as a surrogate pair) in the escaped XML string.
+     * @param key value between & and the ';' example 'amp' for '&amp;'
+     * @param intCode Unicode code of the character the entity stands for
+     * @param htmlString value returned by {@link #getHtmlString()}; when null, "&" + key + ";" is used
+     * @param htmlSpecialEntity entity is affected by translateSpecialEntities property setting.
+     */
+    public SpecialEntity(String key, int intCode, String htmlString, boolean htmlSpecialEntity) {
+        this.key = key;
+        this.intCode = intCode;
+        this.htmlSpecialEntity = htmlSpecialEntity;
+        String str = "&" + key +";";
+        this.htmlString = htmlString != null ? htmlString : str;
+        if ( !htmlSpecialEntity ) {
+            // not translated: the named reference itself is the XML form
+            this.escapedXmlString = str;
+        } else if ( Character.isSupplementaryCodePoint(intCode) ) {
+            // above U+FFFF a (char) cast would drop the high bits; emit the surrogate pair instead
+            this.escapedXmlString = new String(Character.toChars(intCode));
+        } else {
+            this.escapedXmlString = String.valueOf((char) intCode);
+        }
+    }
+
+    /**
+     * @return the key
+     */
+    public String getKey() {
+        return key;
+    }
+
+    /**
+     * @return the intCode
+     */
+    public int intValue() {
+        return intCode;
+    }
+
+    /**
+     * @return the htmlString (the value given to the constructor, or "&key;" when that was null)
+     */
+    public String getHtmlString() {
+        return htmlString;
+    }
+
+    public String getEscapedXmlString() { // the character itself for html special entities, otherwise "&key;"
+        return this.escapedXmlString;
+    }
+
+    public String getEscaped(boolean htmlEscaped) { // true selects the html form, false the xml form
+        return htmlEscaped?this.getHtmlString():this.getEscapedXmlString();
+    }
+
+    /**
+     * @return the htmlSpecialEntity flag passed to the constructor
+     */
+    public boolean isHtmlSpecialEntity() {
+        return htmlSpecialEntity;
+    }
+
+    /**
+     * @return {@link #intValue()} cast to a char (a code above U+FFFF is truncated by the cast)
+     */
+    public char charValue() {
+        return (char) intValue();
+    }
+    /**
+     * @return Numeric Character Reference in decimal format
+     */
+    public String getDecimalNCR() {
+        return "&#" + intCode + ";";
+    }
+
+    /**
+     * @return Numeric Character Reference in hex format
+     */
+    public String getHexNCR() {
+        return "&#x" + Integer.toHexString(intCode) + ";";
+    }
+
+    /**
+     * @return Escaped value of the entity
+     */
+    public String getEscapedValue() {
+        return "&" + key + ";";
+    }
+}
@@ -0,0 +1,447 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.util.*;
+
+/**
+ * <p>
+ * Class contains information about single HTML tag.<br/>
+ * It also contains rules for tag balancing. For each tag, list of dependent
+ * tags may be defined. There are several kinds of dependencies used to reorder
+ * tags:
+ * <ul>
+ *      <li>
+ * 		  fatal tags - required outer tag - the tag will be ignored during
+ *        parsing (will be skipped) if this fatal tag is missing. For example, most web
+ *        browsers ignore elements TD, TR, TBODY if they are not in the context of TABLE tag.
+ *      </li>
+ *      <li>
+ *        required enclosing tags - if there is no such, it is implicitly
+ *        created. For example if TD is out of TR - open TR is created before.
+ *      </li>
+ *      <li>
+ *        forbidden tags - it is not allowed to occur inside - for example
+ *        FORM cannot be inside other FORM and it will be ignored during cleanup.
+ *      </li>
+ *      <li>
+ *        allowed children tags - for example TR allows TD and TH. If there
+ *        are some dependent allowed tags defined then cleaner ignores other tags, treating
+ *        them as not allowed, unless they are in some other relationship with this tag.
+ *      </li>
+ *      <li>
+ *        preferred child tag - where a child tag doesn't match, but we want to by default
+ *        insert an intervening tag rather than just move it outside. For example, LI in UL, TD in TR.
+ *      </li>
+ *      <li>
+ *        higher level tags - for example for TR higher tags are THEAD, TBODY, TFOOT.
+ *      </li>
+ *      <li>
+ *        tags that must be closed and copied - for example, in
+ *        <code>&lt;a href="#"&gt;&lt;div&gt;....</code> tag A must be closed before DIV but
+ *        copied again inside DIV.
+ *      </li>
+ *      <li>
+ *        tags that must be closed before closing this tag and copied again after -
+ *        for example, in <code>&lt;i&gt;&lt;b&gt;at&lt;/i&gt; first&lt;/b&gt; text </code>
+ *        tag B must be closed before closing I, but it must be copied again after resulting
+ *        finally in sequence: <code>&lt;i&gt;&lt;b&gt;at&lt;/b&gt;&lt;/i&gt;&lt;b&gt; first&lt;/b&gt; text </code>.
+ *      </li>
+ * </ul>
+ * </p>
+ *
+ * <p>
+ * Tag TR for instance (table row) may define the following dependencies:
+ *      <ul>
+ *          <li>fatal tag is <code>table</code></li>
+ *          <li>required enclosing tag is <code>tbody</code></li>
+ *          <li>allowed children tags are <code>td,th</code></li>
+ *          <li>higher level tags are <code>thead,tfoot</code></li>
+ *          <li>tags that must be closed before are <code>tr,td,th,caption,colgroup</code></li>
+ *      </ul>
+ * meaning the following: <br>
+ *   <ul>
+ *      <li><code>tr</code> must be in context of <code>table</code>, otherwise it will be ignored,</li>
+ *      <li><code>tr</code> can be directly inside <code>tbody</code>, <code>tfoot</code> and <code>thead</code>,
+ *          otherwise <code>tbody</code> will be implicitly created in front of it.</li>
+ *      <li><code>tr</code> can contain <code>td</code> and <code>th</code>, all other tags and content will be pushed out of current
+ *      limiting context, in the case of html tables, in front of enclosing <code>table</code> tag.</li>
+ *      <li>if previous open tag is one of <code>tr</code>, <code>caption</code> or <code>colgroup</code>, it will be implicitly closed.</li>
+ *   </ul>
+ * </p>
+ */
+public class TagInfo {
+
+    /** Tag name as given to the constructor or {@link #setName(String)}. */
+    private String name;
+    /** Kind of content the tag may hold; drives {@link #allowsItem(BaseToken)}. */
+    private ContentType contentType;
+    /** Names of open tags that must be closed before this tag. */
+    private Set<String> mustCloseTags = new HashSet<String>();
+    /** Names of tags considered higher level than this tag. */
+    private Set<String> higherTags = new HashSet<String>();
+    /** Names of the only child tags allowed; empty means no such restriction. */
+    private Set<String> childTags = new HashSet<String>();
+    /**
+     * Despite its name this set holds the tags registered through
+     * {@link #defineForbiddenTags(String)}; {@link #allowsItem(BaseToken)}
+     * rejects tag tokens whose name is contained in it.
+     */
+    private Set<String> permittedTags = new HashSet<String>();
+    /** Names of tags that are closed before this tag and copied inside it. */
+    private Set<String> copyTags = new HashSet<String>();
+    /** Names of tags that are closed inside this tag and copied again after it. */
+    private Set<String> continueAfterTags = new HashSet<String>();
+    private BelongsTo belongsTo = BelongsTo.BODY;
+    /** Names of required enclosing tags. */
+    private Set<String> requiredParentTags = new HashSet<String>();
+    /** Names of fatal (required outer) tags. */
+    private Set<String> fatalTags = new HashSet<String>();
+    private String preferredChildTag = null;
+    private String assumedNamespace = null;
+    private String assumedNamespacePrefix = null;
+    private boolean deprecated;
+    private boolean unique;
+    private CloseTag closeTag;
+    private Display display;
+
+    /**
+     * Creates the description of a single tag.
+     *
+     * @param name tag name
+     * @param contentType kind of content the tag may hold
+     * @param belongsTo document section the tag belongs to
+     * @param deprecated whether the tag is deprecated
+     * @param unique whether the tag is flagged as unique
+     * @param ignorePermitted accepted for signature compatibility; this constructor does not use it
+     * @param closeTag closing behaviour, consulted by {@link #isMinimizedTagPermitted()}
+     * @param display display type of the tag
+     */
+    public TagInfo(String name, ContentType contentType, BelongsTo belongsTo, boolean deprecated, boolean unique, boolean ignorePermitted, CloseTag closeTag, Display display) {
+        this.name = name;
+        this.contentType = contentType;
+        this.belongsTo = belongsTo;
+        this.deprecated = deprecated;
+        this.unique = unique;
+        this.closeTag = closeTag;
+        this.display = display;
+    }
+
+    /**
+     * Splits a comma separated list of tag names into lower-cased, trimmed names.
+     * Lower-casing uses {@link Locale#ROOT} so the result does not depend on the
+     * default locale; empty entries (for example from "a,,b" or "a, ") are skipped.
+     *
+     * @param commaSeparatedListOfTags list such as "td,th"; must not be null
+     * @return the tag names in the order they were listed
+     */
+    private static List<String> parseTagList(String commaSeparatedListOfTags) {
+        List<String> tags = new ArrayList<String>();
+        StringTokenizer tokenizer = new StringTokenizer(commaSeparatedListOfTags.toLowerCase(Locale.ROOT), ",");
+        while (tokenizer.hasMoreTokens()) {
+            String currTag = tokenizer.nextToken().trim();
+            if (currTag.length() > 0) {
+                tags.add(currTag);
+            }
+        }
+        return tags;
+    }
+
+    /**
+     * Registers fatal tags; each one is also recorded as a higher level tag.
+     *
+     * @param commaSeparatedListOfTags comma separated tag names
+     */
+    public void defineFatalTags(String commaSeparatedListOfTags) {
+        List<String> tags = parseTagList(commaSeparatedListOfTags);
+        this.fatalTags.addAll(tags);
+        this.higherTags.addAll(tags);
+    }
+
+    /**
+     * Registers required enclosing tags; each one is also recorded as a higher level tag.
+     *
+     * @param commaSeparatedListOfTags comma separated tag names
+     */
+    public void defineRequiredEnclosingTags(String commaSeparatedListOfTags) {
+        List<String> tags = parseTagList(commaSeparatedListOfTags);
+        this.requiredParentTags.addAll(tags);
+        this.higherTags.addAll(tags);
+    }
+
+    /**
+     * Registers tags that must not occur inside this tag. They are stored in the
+     * set exposed by {@link #getPermittedTags()}.
+     *
+     * @param commaSeparatedListOfTags comma separated tag names
+     */
+    public void defineForbiddenTags(String commaSeparatedListOfTags) {
+        this.permittedTags.addAll(parseTagList(commaSeparatedListOfTags));
+    }
+
+    /**
+     * Registers the child tags allowed inside this tag.
+     *
+     * @param commaSeparatedListOfTags comma separated tag names
+     */
+    public void defineAllowedChildrenTags(String commaSeparatedListOfTags) {
+        this.childTags.addAll(parseTagList(commaSeparatedListOfTags));
+    }
+
+    /**
+     * Registers higher level tags.
+     *
+     * @param commaSeparatedListOfTags comma separated tag names
+     */
+    public void defineHigherLevelTags(String commaSeparatedListOfTags) {
+        this.higherTags.addAll(parseTagList(commaSeparatedListOfTags));
+    }
+
+    /**
+     * Registers tags that are closed before this tag and copied inside it; each
+     * one is recorded both as a copy tag and as a must-close tag.
+     *
+     * @param commaSeparatedListOfTags comma separated tag names
+     */
+    public void defineCloseBeforeCopyInsideTags(String commaSeparatedListOfTags) {
+        List<String> tags = parseTagList(commaSeparatedListOfTags);
+        this.copyTags.addAll(tags);
+        this.mustCloseTags.addAll(tags);
+    }
+
+    /**
+     * Registers tags that are closed inside this tag and copied again after it.
+     *
+     * @param commaSeparatedListOfTags comma separated tag names
+     */
+    public void defineCloseInsideCopyAfterTags(String commaSeparatedListOfTags) {
+        this.continueAfterTags.addAll(parseTagList(commaSeparatedListOfTags));
+    }
+
+    /**
+     * Registers tags that must be closed before this tag.
+     *
+     * @param commaSeparatedListOfTags comma separated tag names
+     */
+    public void defineCloseBeforeTags(String commaSeparatedListOfTags) {
+        this.mustCloseTags.addAll(parseTagList(commaSeparatedListOfTags));
+    }
+
+    // getters and setters
+
+    public String getAssumedNamespace() {
+        return assumedNamespace;
+    }
+
+    public void setAssumedNamespace(String assumedNamespace) {
+        this.assumedNamespace = assumedNamespace;
+    }
+
+    public String getAssumedNamespacePrefix() {
+        return assumedNamespacePrefix;
+    }
+
+    public void setAssumedNamespacePrefix(String assumedNamespacePrefix) {
+        this.assumedNamespacePrefix = assumedNamespacePrefix;
+    }
+
+    public Display getDisplay() {
+        return display;
+    }
+
+    public void setDisplay(Display display) {
+        this.display = display;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public ContentType getContentType() {
+        return contentType;
+    }
+
+    public Set<String> getMustCloseTags() {
+        return mustCloseTags;
+    }
+
+    public void setMustCloseTags(Set<String> mustCloseTags) {
+        this.mustCloseTags = mustCloseTags;
+    }
+
+    public Set<String> getHigherTags() {
+        return higherTags;
+    }
+
+    public void setHigherTags(Set<String> higherTags) {
+        this.higherTags = higherTags;
+    }
+
+    public Set<String> getChildTags() {
+        return childTags;
+    }
+
+    public void setChildTags(Set<String> childTags) {
+        this.childTags = childTags;
+    }
+
+    public Set<String> getPermittedTags() {
+        return permittedTags;
+    }
+
+    public void setPermittedTags(Set<String> permittedTags) {
+        this.permittedTags = permittedTags;
+    }
+
+    public Set<String> getCopyTags() {
+        return copyTags;
+    }
+
+    public void setCopyTags(Set<String> copyTags) {
+        this.copyTags = copyTags;
+    }
+
+    public Set<String> getContinueAfterTags() {
+        return continueAfterTags;
+    }
+
+    public void setContinueAfterTags(Set<String> continueAfterTags) {
+        this.continueAfterTags = continueAfterTags;
+    }
+
+    public Set<String> getRequiredParentTags() {
+        return requiredParentTags;
+    }
+
+    /**
+     * Adds (does not replace) a required enclosing tag. The name is stored as given.
+     *
+     * @param requiredParent tag name to add
+     */
+    public void setRequiredParent(String requiredParent) {
+        this.requiredParentTags.add(requiredParent);
+    }
+
+    public BelongsTo getBelongsTo() {
+        return belongsTo;
+    }
+
+    public void setBelongsTo(BelongsTo belongsTo) {
+        this.belongsTo = belongsTo;
+    }
+
+    public Set<String> getFatalTags() {
+        return this.fatalTags;
+    }
+
+    /**
+     * @param tag tag name to look up (case sensitive)
+     * @return true if the name was registered as a fatal tag; false otherwise, including for null
+     */
+    public boolean isFatalTag(String tag) {
+        return this.fatalTags.contains(tag);
+    }
+
+    /**
+     * Adds (does not replace) a fatal tag. The name is stored as given.
+     *
+     * @param fatalTag tag name to add
+     */
+    public void setFatalTag(String fatalTag) {
+        this.fatalTags.add(fatalTag);
+    }
+
+    public boolean isDeprecated() {
+        return deprecated;
+    }
+
+    public void setDeprecated(boolean deprecated) {
+        this.deprecated = deprecated;
+    }
+
+    public boolean isUnique() {
+        return unique;
+    }
+
+    public void setUnique(boolean unique) {
+        this.unique = unique;
+    }
+
+    /**
+     * @return true if the content type is {@code ContentType.none}
+     */
+    public boolean isEmptyTag() {
+        return ContentType.none == contentType;
+    }
+
+    // other functionality
+
+    boolean allowsBody() {
+        return ContentType.none != contentType;
+    }
+
+    boolean isHigher(String tagName) {
+        return higherTags.contains(tagName);
+    }
+
+    boolean isCopy(String tagName) {
+        return copyTags.contains(tagName);
+    }
+
+    boolean hasCopyTags() {
+        return !copyTags.isEmpty();
+    }
+
+    boolean isContinueAfter(String tagName) {
+        return continueAfterTags.contains(tagName);
+    }
+
+    boolean hasPermittedTags() {
+        return !permittedTags.isEmpty();
+    }
+
+    boolean isHeadTag() {
+        return belongsTo == BelongsTo.HEAD;
+    }
+
+    /**
+     * @return true if the tag belongs to HEAD or to HEAD_AND_BODY
+     */
+    boolean isHeadAndBodyTag() {
+        return belongsTo == BelongsTo.HEAD || belongsTo == BelongsTo.HEAD_AND_BODY;
+    }
+
+    /**
+     * @param tagInfo description of an open tag, may be null
+     * @return true if the given tag is registered as a must-close tag of this tag
+     *         or has text-only content; false for null
+     */
+    boolean isMustCloseTag(TagInfo tagInfo) {
+        if (tagInfo == null) {
+            return false;
+        }
+        return mustCloseTags.contains(tagInfo.getName()) || tagInfo.contentType == ContentType.text;
+    }
+
+    /**
+     *
+     * @param token
+     * @return true if the passed token is allowed to be nested in a Tag with this TagInfo.
+     */
+    boolean allowsItem(BaseToken token) {
+        // A script tag is accepted inside every tag that can have content at all.
+        if (contentType != ContentType.none && token instanceof TagToken) {
+            if ("script".equals(((TagToken) token).getName())) {
+                return true;
+            }
+        }
+
+        switch (contentType) {
+        case all:
+            // Only tag tokens are filtered; an explicit child list takes
+            // precedence over the forbidden list.
+            if (token instanceof TagToken) {
+                if (!childTags.isEmpty()) {
+                    return childTags.contains(((TagToken) token).getName());
+                }
+                if (!permittedTags.isEmpty()) {
+                    return !permittedTags.contains(((TagToken) token).getName());
+                }
+            }
+            return true;
+        case text:
+            return !(token instanceof TagToken);
+        case none:
+            if (token instanceof ContentNode) {
+                // allow white space in outputted html
+                return ((ContentNode) token).isBlank();
+            } else if (!(token instanceof TagToken)) {
+                // allow directives.
+                return true;
+            }
+            // A tag token inside an empty tag: intentionally falls through to reject it.
+        default:
+            return false;
+        }
+    }
+
+    boolean allowsAnything() {
+        return ContentType.all == contentType && childTags.isEmpty();
+    }
+
+    /**
+     * @return True if the tag can be minimized
+     */
+    public boolean isMinimizedTagPermitted() {
+        return this.closeTag.isMinimizedTagPermitted();
+    }
+
+    public String getPreferredChildTag() {
+        return preferredChildTag;
+    }
+
+    public void setPreferredChildTag(String preferredChildTag) {
+        this.preferredChildTag = preferredChildTag;
+    }
+
+}
@@ -0,0 +1,889 @@
+/*  Copyright (c) 2006-2014, HTMLCleaner project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+    
+    http://htmlcleaner.sourceforge.net/
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.*;
+import java.util.Map.Entry;
+
+import org.htmlcleaner.conditional.ITagNodeCondition;
+import org.htmlcleaner.conditional.TagAllCondition;
+import org.htmlcleaner.conditional.TagNodeAttExistsCondition;
+import org.htmlcleaner.conditional.TagNodeAttValueCondition;
+import org.htmlcleaner.conditional.TagNodeNameCondition;
+
+/**
+ * <p>
+ *      XML node tag - basic node of the cleaned HTML tree. At the same time, it represents start tag token
+ *      after HTML parsing phase and before cleaning phase. After cleaning process, tree structure remains
+ *      containing tag nodes (TagNode class), content (text nodes - ContentNode), comments (CommentNode)
+ *      and optionally doctype node (DoctypeToken).
+ * </p>
+ */
+public class TagNode extends TagToken implements HtmlNode {
+    private final LinkedHashMap<String, String> attributes = new LinkedHashMap<String, String>();
+    private final List<BaseToken> children = new ArrayList<BaseToken>();
+    private DoctypeToken docType;
+    private List<BaseToken> itemsToMove;
+    private Map<String, String> nsDeclarations;
+
+    private transient boolean isFormed;
+
+    /**
+     * Used to indicate a start tag that was auto generated because {@link TagInfo#isContinueAfter(String)}(closedTag.getName()) returned true
+     * For example,
+     * <pre>
+     * <b><i>foo</b>bar
+     * </pre>
+     * would result in a new <i> being created resulting in
+     * <pre>
+     * <b><i>foo</i></b><i>bar</i>
+     * </pre>
+     * The second opening <i> tag is marked as autogenerated. This allows the autogenerated tag to be removed if it is unneeded.
+     */
+    private boolean autoGenerated;
+    
+    /**
+     * This flag is set if we are using namespace aware setting, and the tagnode belongs
+     * to a non-HTML namespace. 
+     */
+    private boolean isForeignMarkup;
+    
+    /**
+     * This flag is set if foreignMarkup is set; if it is false it means that the tagnode tree has not been built and so
+     * it isn't known whether this node is a HTML node or foreign markup such as SVG.
+     */
+    private boolean foreignMarkupFlagSet = false;
+
+    /**
+     * This flag is set if attribute values should be trimmed.
+     */
+    private boolean isTrimAttributeValues = true;
+    
+    /**
+     * Indicates that the node was marked to be pruned out of the tree.
+     */
+    private boolean pruned;
+
+    /**
+     * Indicates that the node is a copy of another node.
+     * @see #makeCopy()
+     */
+    private final boolean isCopy;
+
+    /**
+     * Creates a tag node with the given name that is not flagged as a copy.
+     *
+     * @param name tag name
+     */
+    public TagNode(String name) {
+        this(name, false);
+    }
+
+    /**
+     * Creates a tag node and records whether it is a copy of another node.
+     *
+     * @param name tag name, passed to the superclass constructor
+     * @param isCopy value stored in the final {@code isCopy} flag
+     */
+    private TagNode(String name, boolean isCopy) {
+        super(name);
+        this.isCopy = isCopy;
+    }
+    
+	/* (non-Javadoc)
+	 * @see org.htmlcleaner.TagToken#getName()
+	 */
+	@Override
+	public String getName() {
+		// Foreign markup (e.g. SVG) keeps the name exactly as it was set.
+		if (this.isForeignMarkup) {
+			return name;
+		}
+		// Other names are returned in lower case. Locale.ROOT makes the result
+		// independent of the default locale: with a Turkish default locale
+		// "TITLE".toLowerCase() yields a dotless i and no longer equals "title".
+		return name == null ? null : name.toLowerCase(Locale.ROOT);
+	}
+    
+
+    /**
+     * @param attName
+     * @return Value of the specified attribute, or null if this tag doesn't contain it.
+     */
+    public String getAttributeByName(String attName) {
+        if (attName == null) {
+            return null;
+        }
+        // Case-insensitive lookup: the requested name is lower-cased and looked
+        // up in the lower-cased view of the attributes.
+        final Map<String, String> lowerCasedAttributes = getAttributesInLowerCase();
+        final String lowerCasedName = attName.toLowerCase();
+        return lowerCasedAttributes.get(lowerCasedName);
+    }
+
+    /**
+     * Returns the attributes of the tagnode. 
+     * 
+     * @return Map instance containing all attribute name/value pairs.
+     */
+    public Map<String, String> getAttributes() {
+    	// Returns a copy, so changes to the result do not affect this node.
+    	return new LinkedHashMap<String, String>(this.attributes);
+    }
+    
+    /**
+     * Returns the attributes of the tagnode in lower case. 
+     * 
+     * @return Map instance containing all attribute name/value pairs, with attribute names transformed to lower case
+     */
+    public Map<String, String> getAttributesInLowerCase(){
+    	// Delegates to attributesToLowerCase(), which is defined elsewhere in this class.
+    	return attributesToLowerCase();
+    }
+
+    /**
+     * Replace the current set of attributes with a new set. 
+     * @param attributes
+     */
+    public void setAttributes(Map<String, String> attributes) {
+
+    	//
+    	// The foreign markup flag has been set, i.e. the tree has been built,
+    	// so it is safe to just replace the attributes.
+    	//
+    	if (foreignMarkupFlagSet) {
+    		replaceAttributes(attributes);
+    		return;
+    	}
+
+    	//
+    	// The tree has not been built yet, so we don't know whether this element
+    	// is "foreign markup". Instead of blindly replacing the attributes (and
+    	// possibly overwriting a name with a differently cased version of itself),
+    	// each incoming name is mapped onto the casing already present in the
+    	// current attribute map, where one exists.
+    	//
+    	LinkedHashMap<String, String> processedAttributes = new LinkedHashMap<String, String>();
+
+    	for (Map.Entry<String, String> entry : attributes.entrySet()) {
+    		if (Thread.currentThread().isInterrupted()) {
+    			// Interruption: with many attributes this loop can take a long time.
+    			// The existing attributes are left untouched in this case.
+    			handleInterruption();
+    			return;
+    		}
+    		String key = entry.getKey();
+    		String keyToSet = key;
+
+    		//
+    		// Keep the casing of an existing attribute with the same name. When
+    		// several existing names differ only by case, the last one wins.
+    		//
+    		for (String existingKey : this.attributes.keySet()) {
+    			if (existingKey.equalsIgnoreCase(key)) {
+    				keyToSet = existingKey;
+    			}
+    		}
+
+    		//
+    		// If we have duplicates, keep the first value
+    		//
+    		if (!processedAttributes.containsKey(keyToSet)) {
+    			processedAttributes.put(keyToSet, entry.getValue());
+    		}
+    	}
+    	replaceAttributes(processedAttributes);
+    }
+    
+    /**
+     *    
+     * Clears existing attributes and puts replacement attributes
+     * @param attributes the attributes to set
+     */
+    private void replaceAttributes(Map<String, String> attributes){
+
+    	// The current map is emptied first, so the argument must not be this
+    	// node's own attribute map (it would be empty by the time it is copied).
+    	this.attributes.clear();
+    	this.attributes.putAll(attributes);    	
+    }
+
+    /**
+     * Checks existence of specified attribute.
+     *
+     * @param attName
+     * @return true if TagNode has attribute
+     */
+    public boolean hasAttribute(String attName) {
+        if (attName != null) {
+            // Stored names are compared ignoring case.
+            final Iterator<String> storedNames = attributes.keySet().iterator();
+            while (storedNames.hasNext()) {
+                if (storedNames.next().equalsIgnoreCase(attName)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Adds specified attribute to this tag; if an attribute with the same name already exists, its value is kept.
+     * 
+     * @param attName
+     * @param attValue
+     */
+    @Override
+    public void addAttribute(String attName, String attValue) {
+        if (attName == null) {
+            return;
+        }
+        String normalizedName = attName.trim();
+        // Names are lower-cased only once the node is known not to be foreign markup.
+        if (foreignMarkupFlagSet && !isForeignMarkup) {
+            normalizedName = normalizedName.toLowerCase();
+        }
+        if (normalizedName.length() == 0) {
+            return;
+        }
+        // A missing value is stored as the empty string.
+        String normalizedValue = (attValue != null) ? attValue : "";
+        if (isTrimAttributeValues) {
+            // Trim, then turn every remaining control character into a space.
+            normalizedValue = normalizedValue.trim().replaceAll("\\p{Cntrl}", " ");
+        }
+        // An attribute that is already present keeps its existing value.
+        if (attributes.containsKey(normalizedName)) {
+            return;
+        }
+        attributes.put(normalizedName, normalizedValue);
+    }
+
+    /**
+     * Removes specified attribute from this tag.
+     *
+     * @param attName
+     */
+    public void removeAttribute(String attName) {
+        if (attName != null && !"".equals(attName.trim())) {
+            attributes.remove(attName.toLowerCase());
+        }
+    }
+
+    /**
+     * @return List of child TagNode objects.
+     * @deprecated use {@link TagNode#getChildTagList()}, will be refactored and possibly removed in
+     *             future versions. TODO This method should be refactored because it does not
+     *             properly match the commonly used Java's getter/setter strategy.
+     */
+    @Deprecated
+    public List<TagNode> getChildren() {
+        // Same result as getChildTagList(): only TagNode children, in a new list.
+        return getChildTagList();
+    }
+
+    /**
+     * Replaces the current children with the elements of the given list.
+     *
+     * @param children new children, in order; must not be null
+     */
+    public void setChildren(List<? extends BaseToken> children) {
+        // Snapshot first: the argument may be the live list returned by
+        // getAllChildren(), which clear() would empty before it is copied.
+        List<BaseToken> snapshot = new ArrayList<BaseToken>(children);
+        this.children.clear();
+        this.children.addAll(snapshot);
+    }
+
+    /**
+     * @return the internal list of all children (not a copy), whatever their type
+     */
+    public List<? extends BaseToken> getAllChildren() {
+        return children;
+    }
+
+    /**
+     * @return List of child TagNode objects.
+     */
+    public List<TagNode> getChildTagList() {
+        // Collects the direct children that are TagNode instances, in order.
+        final List<TagNode> tagChildren = new ArrayList<TagNode>();
+        for (BaseToken child : children) {
+            if (!(child instanceof TagNode)) {
+                continue;
+            }
+            tagChildren.add((TagNode) child);
+        }
+        return tagChildren;
+    }
+
+    /**
+     * @return Whether this node has child elements or not.
+     */
+    public boolean hasChildren() {
+        // Counts every kind of child, not only TagNode instances.
+        return !children.isEmpty();
+    }
+
+    /**
+     * @return An array of child TagNode instances.
+     */
+    public TagNode[] getChildTags() {
+        // Array view of getChildTagList(), same elements in the same order.
+        final List<TagNode> tagChildren = getChildTagList();
+        return tagChildren.toArray(new TagNode[tagChildren.size()]);
+    }
+
+    /**
+     * @return Text content of this node and its subelements.
+     */
+    public CharSequence getText() {
+        final StringBuilder buffer = new StringBuilder();
+        for (BaseToken child : children) {
+            if (child instanceof ContentNode) {
+                // Text node: take its content as is.
+                buffer.append(((ContentNode) child).getContent());
+            } else if (child instanceof TagNode) {
+                // Element: append the text of its whole subtree.
+                final CharSequence nestedText = ((TagNode) child).getText();
+                buffer.append(nestedText);
+            }
+            // Any other kind of child contributes no text.
+        }
+        return buffer;
+    }
+
+    /**
+     * @param child Child to find index of
+     * @return Index of the specified child node inside this node's children, -1 if node is not the
+     *         child
+     */
+    public int getChildIndex(HtmlNode child) {
+        // Children are matched by identity (==), not by equals().
+        final int childCount = children.size();
+        for (int position = 0; position < childCount; position++) {
+            if (children.get(position) == child) {
+                return position;
+            }
+        }
+        return -1;
+    }
+
+    /**
+     * Inserts specified node at specified position in array of children
+     *
+     * @param index
+     * @param childToAdd
+     */
+    public void insertChild(int index, HtmlNode childToAdd) {
+        // Plain list insertion; unlike addChild(), the parent of the inserted node is not set here.
+        children.add(index, childToAdd);
+    }
+
+    /**
+     * Inserts specified node in the list of children before specified child
+     *
+     * @param node Child before which to insert new node
+     * @param nodeToInsert Node to be inserted at specified position
+     */
+    public void insertChildBefore(HtmlNode node, HtmlNode nodeToInsert) {
+        final int position = getChildIndex(node);
+        if (position < 0) {
+            // The reference node is not a child of this node: nothing is inserted.
+            return;
+        }
+        insertChild(position, nodeToInsert);
+    }
+
+    /**
+     * Inserts specified node in the list of children after specified child
+     *
+     * @param node Child after which to insert new node
+     * @param nodeToInsert Node to be inserted at specified position
+     */
+    public void insertChildAfter(HtmlNode node, HtmlNode nodeToInsert) {
+        final int position = getChildIndex(node);
+        if (position < 0) {
+            // The reference node is not a child of this node: nothing is inserted.
+            return;
+        }
+        insertChild(position + 1, nodeToInsert);
+    }
+
+    /**
+     * @return the doctype token stored on this node, or null if none has been set
+     */
+    public DoctypeToken getDocType() {
+        return docType;
+    }
+
+    /**
+     * @param docType the doctype token to store on this node
+     */
+    public void setDocType(DoctypeToken docType) {
+        this.docType = docType;
+    }
+
+    /**
+     * Appends a child to this node. A null child is ignored, a List is added
+     * element by element, a ProxyTagNode contributes the token it wraps, and any
+     * other BaseToken is appended directly (a TagNode additionally gets this node
+     * as its parent).
+     *
+     * @param child the object to add
+     * @throws RuntimeException if the child is of none of the supported types
+     */
+    public void addChild(Object child) {
+        if (child == null) {
+            return;
+        }
+        if (child instanceof List) {
+            addChildren((List) child);
+            return;
+        }
+        // ProxyTagNode is checked before the generic BaseToken case.
+        if (child instanceof ProxyTagNode) {
+            children.add(((ProxyTagNode) child).getToken());
+            return;
+        }
+        if (!(child instanceof BaseToken)) {
+            throw new RuntimeException("Attempted to add invalid child object to TagNode; class="+child.getClass());
+        }
+        children.add((BaseToken) child);
+        if (child instanceof TagNode) {
+            ((TagNode) child).parent = this;
+        }
+    }
+
+    /**
+     * Add all elements from specified list to this node.
+     *
+     * @param newChildren
+     */
+    public void addChildren(List newChildren) {
+        if (newChildren == null) {
+            // Nothing to add.
+            return;
+        }
+        for (Object candidate : newChildren) {
+            addChild(candidate);
+        }
+    }
+
+    /**
+     * Finds first element in the tree that satisfy specified condition.
+     *
+     * @param condition
+     * @param isRecursive
+     * @return First TagNode found, or null if no such elements.
+     */
+    private TagNode findElement(ITagNodeCondition condition, boolean isRecursive) {
+        if (condition != null) {
+            for (Object item : children) {
+                if (item instanceof TagNode) {
+                    TagNode currNode = (TagNode) item;
+                    if (condition.satisfy(currNode)) {
+                        return currNode;
+                    } else if (isRecursive) {
+                        TagNode inner = currNode.findElement(condition, isRecursive);
+                        if (inner != null) {
+                            return inner;
+                        }
+                    }
+                }
+            }
+        }
+        return null;
+    }
+    
+    /**
+     * Get all elements in the tree that satisfy specified condition.
+     * @param condition
+     * @param isRecursive
+     * @return List of TagNode instances.
+     */
+    private List<TagNode> findMatchingTagNodes(ITagNodeCondition condition, boolean isRecursive){
+        List<TagNode> result = new LinkedList<TagNode>();
+        if (condition == null) {
+            return result;
+        }
+
+        for (Object item : children) {
+            if (item instanceof TagNode) {
+                TagNode currNode = (TagNode) item;
+                if (condition.satisfy(currNode)) {
+                    result.add(currNode);
+                }
+                if (isRecursive) {
+                    List<TagNode> innerList = currNode.findMatchingTagNodes(condition, isRecursive);
+                    if (innerList != null && innerList.size() > 0) {
+                        result.addAll(innerList);
+                    }
+                }
+            }
+        }
+
+        return result;	
+    }
+
+    /**
+     * Get all elements in the tree that satisfy specified condition.
+     *
+     * @param condition
+     * @param isRecursive
+     * @return List of TagNode instances with specified name.
+     */
+    public List<? extends TagNode> getElementList(ITagNodeCondition condition, boolean isRecursive) {
+        // Public face of the private matcher; the list is handed back as-is.
+        final List<TagNode> matches = findMatchingTagNodes(condition, isRecursive);
+        return matches;
+    }
+
+    /**
+     * @param condition
+     * @param isRecursive
+     * @return The array of all subelements that satisfy specified condition.
+     */
+    private TagNode[] getElements(ITagNodeCondition condition, boolean isRecursive) {
+        final List<TagNode> matches = findMatchingTagNodes(condition, isRecursive);
+        // Defensive: a missing list is reported as an empty array.
+        if (matches == null) {
+            return new TagNode[0];
+        }
+        return matches.toArray(new TagNode[matches.size()]);
+    }
+
+    /**
+     * Lists the elements accepted by a {@link TagAllCondition}.
+     *
+     * @param isRecursive forwarded to the search; when true nested elements are searched too
+     * @return list of matching TagNode instances (never null)
+     */
+    public List<? extends TagNode> getAllElementsList(boolean isRecursive) {
+        final ITagNodeCondition acceptAll = new TagAllCondition();
+        return getElementList(acceptAll, isRecursive);
+    }
+
+    /**
+     * Array variant of {@link #getAllElementsList(boolean)}.
+     *
+     * @param isRecursive forwarded to the search; when true nested elements are searched too
+     * @return array of matching TagNode instances (empty when nothing matches)
+     */
+    public TagNode[] getAllElements(boolean isRecursive) {
+        final ITagNodeCondition acceptAll = new TagAllCondition();
+        return getElements(acceptAll, isRecursive);
+    }
+
+    /**
+     * Finds the first element accepted by a {@link TagNodeNameCondition} built from the given name.
+     *
+     * @param findName name handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @return first matching TagNode, or null if there is none
+     */
+    public TagNode findElementByName(String findName, boolean isRecursive) {
+        final ITagNodeCondition byName = new TagNodeNameCondition(findName);
+        return findElement(byName, isRecursive);
+    }
+
+    /**
+     * Lists the elements accepted by a {@link TagNodeNameCondition} built from the given name.
+     *
+     * @param findName name handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @return list of matching TagNode instances (never null)
+     */
+    public List<? extends TagNode> getElementListByName(String findName, boolean isRecursive) {
+        final ITagNodeCondition byName = new TagNodeNameCondition(findName);
+        return getElementList(byName, isRecursive);
+    }
+
+    /**
+     * Array variant of {@link #getElementListByName(String, boolean)}.
+     *
+     * @param findName name handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @return array of matching TagNode instances (empty when nothing matches)
+     */
+    public TagNode[] getElementsByName(String findName, boolean isRecursive) {
+        final ITagNodeCondition byName = new TagNodeNameCondition(findName);
+        return getElements(byName, isRecursive);
+    }
+
+    /**
+     * Finds the first element accepted by a {@link TagNodeAttExistsCondition} for the given attribute name.
+     *
+     * @param attName attribute name handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @return first matching TagNode, or null if there is none
+     */
+    public TagNode findElementHavingAttribute(String attName, boolean isRecursive) {
+        final ITagNodeCondition hasAttribute = new TagNodeAttExistsCondition(attName);
+        return findElement(hasAttribute, isRecursive);
+    }
+
+    /**
+     * Lists the elements accepted by a {@link TagNodeAttExistsCondition} for the given attribute name.
+     *
+     * @param attName attribute name handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @return list of matching TagNode instances (never null)
+     */
+    public List<? extends TagNode> getElementListHavingAttribute(String attName, boolean isRecursive) {
+        final ITagNodeCondition hasAttribute = new TagNodeAttExistsCondition(attName);
+        return getElementList(hasAttribute, isRecursive);
+    }
+
+    /**
+     * Array variant of {@link #getElementListHavingAttribute(String, boolean)}.
+     *
+     * @param attName attribute name handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @return array of matching TagNode instances (empty when nothing matches)
+     */
+    public TagNode[] getElementsHavingAttribute(String attName, boolean isRecursive) {
+        final ITagNodeCondition hasAttribute = new TagNodeAttExistsCondition(attName);
+        return getElements(hasAttribute, isRecursive);
+    }
+
+    /**
+     * Finds the first element accepted by a {@link TagNodeAttValueCondition}.
+     *
+     * @param attName attribute name handed to the condition
+     * @param attValue attribute value handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @param isCaseSensitive case-sensitivity flag handed to the condition
+     * @return first matching TagNode, or null if there is none
+     */
+    public TagNode findElementByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
+        final ITagNodeCondition byValue = new TagNodeAttValueCondition(attName, attValue, isCaseSensitive);
+        return findElement(byValue, isRecursive);
+    }
+
+    /**
+     * Lists the elements accepted by a {@link TagNodeAttValueCondition}.
+     *
+     * @param attName attribute name handed to the condition
+     * @param attValue attribute value handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @param isCaseSensitive case-sensitivity flag handed to the condition
+     * @return list of matching TagNode instances (never null)
+     */
+    public List<? extends TagNode> getElementListByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
+        final ITagNodeCondition byValue = new TagNodeAttValueCondition(attName, attValue, isCaseSensitive);
+        return getElementList(byValue, isRecursive);
+    }
+
+    /**
+     * Array variant of {@link #getElementListByAttValue(String, String, boolean, boolean)}.
+     *
+     * @param attName attribute name handed to the condition
+     * @param attValue attribute value handed to the condition
+     * @param isRecursive when true nested elements are searched too
+     * @param isCaseSensitive case-sensitivity flag handed to the condition
+     * @return array of matching TagNode instances (empty when nothing matches)
+     */
+    public TagNode[] getElementsByAttValue(String attName, String attValue, boolean isRecursive, boolean isCaseSensitive) {
+        final ITagNodeCondition byValue = new TagNodeAttValueCondition(attName, attValue, isCaseSensitive);
+        return getElements(byValue, isRecursive);
+    }
+
+    /**
+     * Evaluates an XPath expression on the given node. <br>
+     * <em>
+     *  This is not fully supported XPath parser and evaluator.
+     *  Examples below show supported elements:
+     * </em> <code>
+     * <ul>
+     *      <li>//div//a</li>
+     *      <li>//div//a[@id][@class]</li>
+     *      <li>/body/*[1]/@type</li>
+     *      <li>//div[3]//a[@id][@href='r/n4']</li>
+     *      <li>//div[last() >= 4]//./div[position() = last()])[position() > 22]//li[2]//a</li>
+     *      <li>//div[2]/@*[2]</li>
+     *      <li>data(//div//a[@id][@class])</li>
+     *      <li>//p/last()</li>
+     *      <li>//body//div[3][@class]//span[12.2<position()]/@id</li>
+     *      <li>data(//a['v' < @id])</li>
+     * </ul>
+     * </code>
+     *
+     * @param xPathExpression
+     * @return result of XPather evaluation.
+     * @throws XPatherException
+     */
+    public Object[] evaluateXPath(String xPathExpression) throws XPatherException {
+        // A fresh evaluator is built per call and run with this node as the context.
+        final XPather evaluator = new XPather(xPathExpression);
+        return evaluator.evaluateAgainstNode(this);
+    }
+
+    /**
+     * Remove this node from the tree.
+     *
+     * @return True if element is removed (if it is not root node).
+     */
+    public boolean removeFromTree() {
+        // A node without a parent has nothing to be detached from.
+        if (parent == null) {
+            return false;
+        }
+        return parent.removeChild(this);
+    }
+
+    /**
+     * Remove specified child element from this node.
+     *
+     * @param child
+     * @return True if child object existed in the children list.
+     */
+    public boolean removeChild(Object child) {
+        // List.remove reports whether the element was actually present.
+        final boolean existed = children.remove(child);
+        return existed;
+    }
+
+    /**
+     * Removes all children (subelements and text content).
+     */
+    public void removeAllChildren() {
+        // Empties the child list in place; the list instance itself is kept.
+        children.clear();
+    }
+
+    /**
+     * Queues an item on this node's "items to move" list, creating the list on first use.
+     *
+     * @param item item to queue; must be a BaseToken
+     * @throws RuntimeException if item is null or not a BaseToken
+     */
+    void addItemForMoving(Object item) {
+        // Validate first so a rejected item leaves this node's state untouched.
+        if (!(item instanceof BaseToken)) {
+            // instanceof is false for null, so guard getClass() to report the
+            // problem with a descriptive message instead of a bare NullPointerException.
+            throw new RuntimeException("Attempt to add invalid item for moving; class="
+                    + (item == null ? null : item.getClass()));
+        }
+        if (itemsToMove == null) {
+            itemsToMove = new ArrayList<BaseToken>();
+        }
+        itemsToMove.add((BaseToken) item);
+    }
+
+    /**
+     * @return the list of items queued for moving; addItemForMoving creates it lazily, so this can be null
+     */
+    List<? extends BaseToken> getItemsToMove() {
+        return this.itemsToMove;
+    }
+
+    void setItemsToMove(List<BaseToken> itemsToMove) {
+        this.itemsToMove = itemsToMove;
+    }
+
+    /**
+     * @return current value of the "formed" flag
+     */
+    boolean isFormed() {
+        return this.isFormed;
+    }
+
+    void setFormed(boolean isFormed) {
+        this.isFormed = isFormed;
+    }
+
+    /**
+     * Convenience overload that marks this node as formed.
+     */
+    void setFormed() {
+        this.setFormed(true);
+    }
+
+    /**
+     * @param autoGenerated the autoGenerated to set
+     */
+    public void setAutoGenerated(boolean autoGenerated) {
+        // Plain flag assignment, no side effects.
+        final boolean generated = autoGenerated;
+        this.autoGenerated = generated;
+    }
+
+    /**
+     * @return the autoGenerated
+     */
+    public boolean isAutoGenerated() {
+        // Returns the flag exactly as last set.
+        return this.autoGenerated;
+    }
+
+    /**
+     * @return true, if node was marked to be pruned.
+     */
+    public boolean isPruned() {
+        // Returns the flag exactly as last set via setPruned.
+        return this.pruned;
+    }
+
+    /**
+     * @param pruned true to mark this node as pruned, false to clear the mark
+     */
+    public void setPruned(boolean pruned) {
+        final boolean marked = pruned;
+        this.pruned = marked;
+    }
+
+    /**
+     * Tells whether this node carries no meaningful content. A pruned node is always
+     * considered empty; otherwise every child must be a pruned TagNode or a blank ContentNode.
+     *
+     * @return true if the node is pruned or has no significant children
+     */
+    public boolean isEmpty() {
+        if (isPruned()) {
+            return true;
+        }
+        for (Object child : this.children) {
+            if (child instanceof TagNode) {
+                if (((TagNode) child).isPruned()) {
+                    continue; // pruned elements do not count as content
+                }
+                return false;
+            }
+            if (child instanceof ContentNode) {
+                if (((ContentNode) child).isBlank()) {
+                    continue; // blank text does not count as content
+                }
+                return false;
+            }
+            // Comments count as content (they may hold browser specific commands),
+            // and so does any other kind of child.
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * Adds namespace declaration to the node
+     *
+     * @param nsPrefix Namespace prefix
+     * @param nsURI Namespace URI
+     */
+    public void addNamespaceDeclaration(String nsPrefix, String nsURI) {
+        if (nsDeclarations == null) {
+            nsDeclarations = new TreeMap<String, String>();
+        }
+        nsDeclarations.put(nsPrefix, nsURI);
+    }
+
+    /**
+     * Collect all prefixes in namespace declarations up the path to the document root from the
+     * specified node
+     *
+     * @param prefixes Set of prefixes to be collected
+     */
+    void collectNamespacePrefixesOnPath(Set<String> prefixes) {
+        Map<String, String> nsDeclarations = getNamespaceDeclarations();
+        if (nsDeclarations != null) {
+            for (String prefix : nsDeclarations.keySet()) {
+                prefixes.add(prefix);
+            }
+        }
+        if (parent != null) {
+            parent.collectNamespacePrefixesOnPath(prefixes);
+        }
+    }
+
+    String getNamespaceURIOnPath(String nsPrefix) {
+        if (nsDeclarations != null) {
+            for (Map.Entry<String, String> nsEntry : nsDeclarations.entrySet()) {
+                String currName = nsEntry.getKey();
+                if (currName.equals(nsPrefix) || ("".equals(currName) && nsPrefix == null)) {
+                    return nsEntry.getValue();
+                }
+            }
+        }
+        if (parent != null) {
+            return parent.getNamespaceURIOnPath(nsPrefix);
+        }
+
+        return null;
+    }
+
+    /**
+     * @return Map of namespace declarations for this node
+     */
+    public Map<String, String> getNamespaceDeclarations() {
+        // The internal map is returned directly (no copy).
+        return this.nsDeclarations;
+    }
+
+    /**
+     * Hands this node to the given serializer, which writes it to the given writer.
+     *
+     * @param serializer serializer that does the actual work
+     * @param writer destination passed through to the serializer
+     * @throws IOException propagated from the serializer
+     */
+    public void serialize(Serializer serializer, Writer writer) throws IOException {
+        final TagNode self = this;
+        serializer.serialize(self, writer);
+    }
+
+    /**
+     * Creates a new TagNode with the same name and a copy of this node's attributes.
+     * Children are not copied.
+     *
+     * @return the newly created node
+     */
+    public TagNode makeCopy() {
+        // NOTE(review): the second constructor argument presumably marks the node as a copy - confirm.
+        final TagNode duplicate = new TagNode(name, true);
+        duplicate.attributes.putAll(this.attributes);
+        return duplicate;
+    }
+
+    /**
+     * @return current value of the "copy" flag
+     */
+    public boolean isCopy() {
+        return this.isCopy;
+    }
+
+    /**
+     * Traverses the tree and performs visitor's action on each node. It stops when it finishes all
+     * the tree or when visitor returns false.
+     *
+     * @param visitor TagNodeVisitor implementation
+     */
+    public void traverse(TagNodeVisitor visitor) {
+        // The "continue" result only matters inside the recursion, so it is dropped here.
+        this.traverseInternally(visitor);
+    }
+
+    private boolean traverseInternally(TagNodeVisitor visitor) { // depth-first walk starting at this node; returns false once the visitor asks to stop
+        if (visitor != null) { // a null visitor makes this a no-op that reports "continue"
+            boolean hasParent = parent != null; // captured before the visit so a change made by the visitor can be detected
+            boolean toContinue = visitor.visit(parent, this);
+
+            if (!toContinue) {
+                return false; // if visitor stops traversal
+            } else if (hasParent && parent == null) { // NOTE(review): removeFromTree()/removeChild() in this class do not reset parent - confirm what is expected to clear it
+                return true; // if this node is pruned from the tree during the visit, then don't go deeper
+            }
+            for (Object child : children.toArray()) { // make an array to avoid ConcurrentModificationException when some node is cut
+                if (child instanceof TagNode) {
+                    toContinue = ((TagNode) child).traverseInternally(visitor); // element children are walked recursively
+                } else if (child instanceof ContentNode) {
+                    toContinue = visitor.visit(this, (ContentNode) child); // text is reported with this node as its parent
+                } else if (child instanceof CommentNode) {
+                    toContinue = visitor.visit(this, (CommentNode) child); // comments are reported with this node as its parent
+                } // any other child type is not visited; toContinue keeps its previous value
+                if (!toContinue) {
+                    return false; // propagate the stop request up the recursion
+                }
+            }
+        }
+        return true;
+    }
+
+	/**
+	 * @return the isForeignMarkup
+	 */
+	public boolean isForeignMarkup() {
+		// Returns the flag exactly as last set.
+		return this.isForeignMarkup;
+	}
+
+	/**
+	 * @param isForeignMarkup the isForeignMarkup to set
+	 */
+	public void setForeignMarkup(boolean isForeignMarkup) {
+		// Record that the flag was set explicitly, then store its value.
+		this.foreignMarkupFlagSet = true;
+		this.isForeignMarkup = isForeignMarkup;
+
+		if (isForeignMarkup) {
+			return; // foreign markup keeps its attribute names untouched
+		}
+
+		// Not foreign markup: switch all existing attributes of this
+		// element to their lowercase form.
+		this.replaceAttributes(getAttributesInLowerCase());
+	}
+
+	/**
+	 * @return the isTrimAttributeValues
+	 */
+	public boolean isTrimAttributeValues() {
+		// Returns the flag exactly as last set.
+		return this.isTrimAttributeValues;
+	}
+
+	/**
+	 * @param isTrimAttributeValues the isTrimAttributeValues to set
+	 */ 
+	public void setTrimAttributeValues(boolean isTrimAttributeValues) {
+		// Plain flag assignment; existing attribute values are not touched here.
+		final boolean trim = isTrimAttributeValues;
+		this.isTrimAttributeValues = trim;
+	}
+	
+	/**
+	 * Returns a copy of the set of attributes for this node with lowercase
+	 * names. Where there are duplicate attributes (e.g. class, CLASS) the first
+	 * value is retained.
+	 * @return a map of attributes in key/value pairs with names in lowercase
+	 */
+	private Map<String, String> attributesToLowerCase(){
+		Map<String, String> lowerCaseAttributes = new LinkedHashMap<String, String>();
+		for (Entry<String, String> entry: attributes.entrySet()){
+			String key = entry.getKey();
+			if (!lowerCaseAttributes.containsKey(key.toLowerCase())){
+				lowerCaseAttributes.put(key.toLowerCase(), attributes.get(key));
+			}
+		}
+		return lowerCaseAttributes;
+	}
+	
+	/**
+	 * Called whenever the thread is interrupted. Currently this is a
+	 * placeholder, but could hold cleanup methods and user interaction
+	 */
+	private void handleInterruption(){ // intentionally empty: no cleanup is performed yet
+		
+	}
+
+}
@@ -0,0 +1,16 @@
+package org.htmlcleaner;
+
+/**
+ * Defines action to be performed on TagNodes
+ */
+public interface TagNodeVisitor {
+
+    /**
+     * Callback invoked for a single node of the tree.
+     *
+     * @param parentNode parent of the visited node
+     * @param htmlNode the node being visited
+     * @return {@code true} to keep traversing the tree, {@code false} to stop.
+     */
+    boolean visit(TagNode parentNode, HtmlNode htmlNode);
+
+}
@@ -0,0 +1,18 @@
+package org.htmlcleaner;
+
+/**
+ * Contains information about a single open tag
+ */
+
+class TagPos {
+
+	/** Position value supplied at construction time. */
+	int position;
+	/** Tag name supplied at construction time. */
+	String name;
+	/** Tag description supplied at construction time. */
+	TagInfo info;
+
+	/**
+	 * Stores the given values as-is.
+	 *
+	 * @param position position of the tag
+	 * @param name name of the tag
+	 * @param tagInfo description of the tag
+	 * @param cleanTimeValues accepted but neither stored nor read by this constructor
+	 */
+	TagPos(int position, String name, TagInfo tagInfo, CleanTimeValues cleanTimeValues) {
+		this.info = tagInfo;
+		this.name = name;
+		this.position = position;
+	}
+}
@@ -0,0 +1,66 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+
+/**
+ * <p>HTML tag token - descendants are start (TagNode) and end token (EndTagToken).</p>
+ */
+public abstract class TagToken extends BaseHtmlNode {
+
+    /** Tag name; left unset by the no-argument constructor. */
+    protected String name;
+
+	/** Creates a token without a name. */
+	public TagToken() {
+		super();
+	}
+
+	/**
+	 * Creates a token with the given name.
+	 *
+	 * @param name tag name, stored as given
+	 */
+	public TagToken(String name) {
+		super();
+		this.name = name;
+	}
+
+	/**
+	 * @return the tag name as stored
+	 */
+	public String getName() {
+		return this.name;
+	}
+
+	/**
+	 * @return the tag name, identical to {@link #getName()}
+	 */
+	@Override
+    public String toString() {
+		return this.name;
+	}
+
+    /**
+     * Adds an attribute to this token; the concrete behaviour is defined by subclasses.
+     *
+     * @param attName attribute name
+     * @param attValue attribute value
+     */
+    abstract void addAttribute(String attName, String attValue);
+
+}
@@ -0,0 +1,231 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.LinkedHashMap;
+import java.util.regex.Pattern;
+
+/**
+ * Describes how specified tag is transformed to another one, or is ignored during parsing
+ */
+public class TagTransformation {
+    /** Marker that opens a template variable. */
+    public static String VAR_START = "${";
+    /** Marker that closes a template variable. */
+    public static String VAR_END = "}";
+
+    private String sourceTag;
+    private String destTag;
+    private boolean preserveSourceAttributes;
+    /** Target attribute name (lowercase) mapped to its value template; a null template removes the attribute. */
+    private Map<String, String> attributeTransformations = new LinkedHashMap<String, String>();
+    /** Transformations selected by matching attribute name/value instead of a fixed name. */
+    private List<AttributeTransformation> attributePatternTransformations = new ArrayList<AttributeTransformation>();
+
+    /**
+     * Creates a transformation without source or destination tag that preserves source attributes.
+     */
+    public TagTransformation() {
+        this.preserveSourceAttributes = true;
+    }
+
+    /**
+     * Creates new tag transformation from source tag to target tag specifying whether
+     * source tag attributes are preserved.
+     * @param sourceTag Name of the tag to be transformed.
+     * @param destTag Name of tag to which source tag is to be transformed. If it is not a
+     *        valid XML identifier the (lowercased) source tag is used instead.
+     * @param preserveSourceAttributes Tells whether source tag attributes are preserved in transformation.
+     */
+    public TagTransformation(String sourceTag, String destTag, boolean preserveSourceAttributes) {
+        this.sourceTag = sourceTag.toLowerCase();
+        if (destTag == null) {
+            this.destTag = null;
+        } else {
+            // Fall back to the lowercased source tag so both tag names follow the same
+            // casing rule (the raw parameter could still contain upper-case letters).
+            this.destTag = Utils.isValidXmlIdentifier(destTag) ? destTag.toLowerCase() : this.sourceTag;
+        }
+        this.preserveSourceAttributes = preserveSourceAttributes;
+    }
+
+    /**
+     * Creates new tag transformation from source tag to target tag preserving
+     * all source tag attributes.
+     * @param sourceTag Name of the tag to be transformed.
+     * @param destTag Name of tag to which source tag is to be transformed.
+     */
+    public TagTransformation(String sourceTag, String destTag) {
+        this(sourceTag, destTag, true);
+    }
+
+    /**
+     * Creates new tag transformation in which specified tag will be skipped (ignored)
+     * during parsing process.
+     * @param sourceTag Name of the tag to be skipped.
+     */
+    public TagTransformation(String sourceTag) {
+        this(sourceTag, null);
+    }
+
+    /**
+     * Adds new attribute transformation to this tag transformation. It tells how destination
+     * attribute will look like. Small templating mechanism is used to describe attribute value:
+     * all names between ${ and } inside the template are evaluated against source tag attributes.
+     * That way one can make attribute values consist of mix of source tag attributes.
+     *
+     * @param targetAttName Name of the destination attribute
+     * @param transformationDesc Template describing attribute value.
+     */
+    public void addAttributeTransformation(String targetAttName, String transformationDesc) {
+        attributeTransformations.put(targetAttName.toLowerCase(), transformationDesc);
+    }
+
+    /**
+     * Adds a transformation applied to every attribute whose name matches the pattern.
+     *
+     * @param attNamePattern pattern for attribute names
+     * @param transformationDesc Template describing attribute value.
+     */
+    public void addAttributePatternTransformation(Pattern attNamePattern, String transformationDesc) {
+        // Delegate so every overload goes through the same null-safe registration.
+        addAttributePatternTransformation(new AttributeTransformationPatternImpl(attNamePattern, null, transformationDesc));
+    }
+
+    /**
+     * Adds a transformation selected by attribute name and attribute value patterns.
+     *
+     * @param attNamePattern pattern for attribute names
+     * @param attValuePattern pattern for attribute values
+     * @param transformationDesc Template describing attribute value.
+     */
+    public void addAttributePatternTransformation(Pattern attNamePattern, Pattern attValuePattern, String transformationDesc) {
+        addAttributePatternTransformation(new AttributeTransformationPatternImpl(attNamePattern, attValuePattern, transformationDesc));
+    }
+
+    /**
+     * Registers a pattern based attribute transformation.
+     *
+     * @param attributeTransformation transformation to register
+     */
+    public void addAttributePatternTransformation(AttributeTransformation attributeTransformation) {
+        if (attributePatternTransformations == null) {
+            attributePatternTransformations = new ArrayList<AttributeTransformation>();
+        }
+        attributePatternTransformations.add(attributeTransformation);
+    }
+
+    /**
+     * Adds new attribute transformation in which destination attribute will not exist
+     * (simply removes it from list of attributes).
+     * @param targetAttName Name of the attribute to remove.
+     */
+    public void addAttributeTransformation(String targetAttName) {
+        addAttributeTransformation(targetAttName, null);
+    }
+
+    /**
+     * @return true if either transformation collection exists. Both are created together with
+     *         the instance, so this reports true even when no transformation has been added.
+     */
+    boolean hasAttributeTransformations() {
+        return attributeTransformations != null || attributePatternTransformations != null;
+    }
+
+    String getSourceTag() {
+        return sourceTag;
+    }
+
+    String getDestTag() {
+        return destTag;
+    }
+
+    boolean isPreserveSourceAttributes() {
+        return preserveSourceAttributes;
+    }
+
+    Map<String, String> getAttributeTransformations() {
+        return attributeTransformations;
+    }
+
+    /**
+     * Applies the named and the pattern based attribute transformations to the given attributes.
+     *
+     * @param attributes source tag attributes; this map is not modified
+     * @return a new map holding the transformed attributes, or the given map itself when there
+     *         is nothing to transform and source attributes are preserved
+     */
+    public Map<String, String> applyTagTransformations(Map<String, String> attributes) {
+        boolean isPreserveSourceAtts = isPreserveSourceAttributes();
+        boolean hasAttTransforms = hasAttributeTransformations();
+        if (!hasAttTransforms && isPreserveSourceAtts) {
+            return attributes;
+        }
+
+        Map<String, String> newAttributes = isPreserveSourceAtts
+                ? new LinkedHashMap<String, String>(attributes)
+                : new LinkedHashMap<String, String>();
+        if (hasAttTransforms) {
+            // Named transformations: a null template removes the attribute, anything else
+            // is evaluated against the ORIGINAL attributes.
+            for (Map.Entry<String, String> entry : getAttributeTransformations().entrySet()) {
+                String attName = entry.getKey();
+                String template = entry.getValue();
+                if (template == null) {
+                    newAttributes.remove(attName);
+                } else {
+                    newAttributes.put(attName, evaluateTemplate(template, attributes));
+                }
+            }
+
+            // Pattern transformations: applied to every source attribute they accept.
+            for (AttributeTransformation attributeTransformation : this.attributePatternTransformations) {
+                for (Map.Entry<String, String> sourceAttribute : attributes.entrySet()) {
+                    String attName = sourceAttribute.getKey();
+                    if (attributeTransformation.satisfy(attName, sourceAttribute.getValue())) {
+                        String template = attributeTransformation.getTemplate();
+                        if (template == null) {
+                            newAttributes.remove(attName);
+                        } else {
+                            newAttributes.put(attName, evaluateTemplate(template, attributes));
+                        }
+                    }
+                }
+            }
+        }
+        return newAttributes;
+    }
+
+    /**
+     * Evaluates string template for specified map of variables. Template string can contain
+     * dynamic parts in the form of ${VARNAME}. Each such part is replaced with value of the
+     * variable if such exists in the map, or with empty string otherwise. Variable names are
+     * lowercased before the lookup. A variable start marker that is never closed is kept
+     * literally, together with everything that follows it.
+     *
+     * @param template Template string
+     * @param variables Map of variables (can be null)
+     * @return Evaluated string, or null if the template is null
+     */
+    public String evaluateTemplate(String template, Map<String, String> variables) {
+        if (template == null) {
+            return template;
+        }
+
+        StringBuilder result = new StringBuilder();
+
+        // Index of the first template character that has not been copied to the result yet.
+        int cursor = 0;
+        int startIndex = template.indexOf(VAR_START);
+
+        while (startIndex >= 0) {
+            int endIndex = template.indexOf(VAR_END, startIndex + VAR_START.length());
+            if (endIndex < 0) {
+                // Unterminated variable: stop here so the tail is emitted exactly once below
+                // (previously the already copied prefix was appended a second time).
+                break;
+            }
+
+            result.append(template, cursor, startIndex);
+
+            String varName = template.substring(startIndex + VAR_START.length(), endIndex);
+            Object resultObj = variables != null ? variables.get(varName.toLowerCase()) : null;
+            if (resultObj != null) {
+                result.append(resultObj.toString());
+            }
+
+            // Skip the whole end marker, whatever its length.
+            cursor = endIndex + VAR_END.length();
+            startIndex = template.indexOf(VAR_START, cursor);
+        }
+
+        result.append(template, cursor, template.length());
+
+        return result.toString();
+    }
+}
@@ -0,0 +1,123 @@
+/*  Copyright (c) 2006-2019, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+package org.htmlcleaner;
+
+import java.io.StringWriter;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.w3c.dom.Document;
+
+/**
+ * A traversal-based serializer for DOM; used to avoid recursion and stack overflow for large
+ * HTML documents.
+ */
+public class TraversalDomSerializer {
+
+	private CleanerProperties props;
+	
+    /**
+     * Whether XML entities should be escaped or not.
+     */
+    protected boolean escapeXml = true;
+    protected boolean deserializeCdataEntities = false;
+    protected boolean strictErrorChecking = true;
+    
+    /**
+     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
+     * @param escapeXml if true then escape XML entities
+     * @param deserializeCdataEntities if true then deserialize entities in CData sections
+     * @param strictErrorChecking if false then Document strict error checking is turned off
+     */
+    public TraversalDomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities, boolean strictErrorChecking){
+        this.props = props;
+        this.escapeXml = escapeXml;
+        this.deserializeCdataEntities = deserializeCdataEntities;
+        this.strictErrorChecking = strictErrorChecking;
+    }
+
+    /**
+     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
+     * @param escapeXml if true then escape XML entities
+     * @param deserializeCdataEntities if true then deserialize entities in CData sections
+     */
+    public TraversalDomSerializer(CleanerProperties props, boolean escapeXml, boolean deserializeCdataEntities) {
+        this.props = props;
+        this.escapeXml = escapeXml;
+        this.deserializeCdataEntities = deserializeCdataEntities;
+    }
+
+    /**
+     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
+     * @param escapeXml if true then escape XML entities
+     */
+    public TraversalDomSerializer(CleanerProperties props, boolean escapeXml) {
+        this.props = props;
+        this.escapeXml = escapeXml;
+    }
+
+    /**
+     * @param props the HTML Cleaner properties set by the user to control the HTML cleaning.
+     */
+    public TraversalDomSerializer(CleanerProperties props) {
+        this.props = props;
+    }
+    
+    /**
+     * @param rootNode the HTML Cleaner root node to serialize
+     * @return the W3C Document object
+     * @throws ParserConfigurationException if there's an error during serialization
+     */
+    public Document createDOM(TagNode rootNode) throws ParserConfigurationException {
+    	DomBuilder builder = new DomBuilder(props, escapeXml, deserializeCdataEntities, strictErrorChecking);
+    	XmlTraversor.traverse(builder, rootNode);
+        return builder.getDocument();
+    }
+    
+    public static String toString(Document doc) throws TransformerException, ParserConfigurationException{
+    	DOMSource domSource = new DOMSource(doc);
+    	StringWriter writer = new StringWriter();
+    	StreamResult result = new StreamResult(writer);
+    	TransformerFactory tf = TransformerFactory.newInstance();
+    	Transformer transformer = tf.newTransformer();
+    	transformer.transform(domSource, result);
+    	return writer.toString();
+    }
+	
+}
@@ -0,0 +1,907 @@
+/*  Copyright (c) 2006-2019, the HtmlCleaner project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+package org.htmlcleaner;
+
+import java.io.*;
+import java.net.URL;
+import java.util.StringTokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * <p>Common utilities.</p>
+ *
+ * Created by: Vladimir Nikic<br/>
+ * Date: November, 2006.
+ */
+public class Utils {
+	
+	/**
+	 * Regular expression matching one valid XML name start character at the beginning of
+	 * the input (the NameStartChar production of the XML specification).
+	 * The range U+00C0-U+00D6 is written with its hyphen; without it only the two end
+	 * points were accepted and letters such as U+00C1 were rejected as start characters.
+	 */
+	static final String VALID_XML_IDENTIFIER_START_CHAR_REGEX = "^[:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02ff\\u0370-\\u037d"
+			+ "\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f\\u2c00-\\u2fef\\u3001-\\ud7ff"
+			+ "\\uf900-\\ufdcf\\ufdf0-\\ufffd\\x{10000}-\\x{EFFFF}]";
+	/** Compiled form of {@link #VALID_XML_IDENTIFIER_START_CHAR_REGEX}. */
+	static final Pattern VALID_XML_IDENTIFIER_START_CHAR_PATTERN = 
+			compileUnicodePattern(VALID_XML_IDENTIFIER_START_CHAR_REGEX);
+	
+	/*
+		The relevant production from the spec is http://www.w3.org/TR/xml/#NT-Name
+		Name ::= NameStartChar NameChar *
+		NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+		NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+
+		The expression below is anchored at both ends (^ ... \Z): one character from the
+		start-character class followed by zero or more characters from the name-character class.
+
+		NOTE(review): the name-character class runs one range up to \udfff (the start class stops
+		at \ud7ff) and does not list \x{10000}-\x{EFFFF}; presumably this is meant to let surrogate
+		code units through - confirm against the production above.
+	 */
+	static final String VALID_XML_IDENTIFIER_CHAR_REGEX = 
+			  "^[:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02ff\\u0370-\\u037d"
+			+ "\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f\\u2c00-\\u2fef\\u3001-\\ud7ff"
+			+ "\\uf900-\\ufdcf\\ufdf0-\\ufffd\\x{10000}-\\x{EFFFF}]"
+			+ "[:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6"
+			+ "\\u00F8-\\u02ff\\u0370-\\u037d\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f"
+			+ "\\u2c00-\\u2fef\\u3001-\\udfff\\uf900-\\ufdcf\\ufdf0-\\ufffd\\-\\.0-9"
+			+ "\\u00b7\\u0300-\\u036f\\u203f-\\u2040]*\\Z";
+	// Compiled once and shared by isValidXmlIdentifier.
+	static final Pattern VALID_XML_IDENTIFIER_CHAR_PATTERN = 
+			compileUnicodePattern(VALID_XML_IDENTIFIER_CHAR_REGEX);
+
+	
+	/**
+	 * Removes the first newline and last newline (if present) of a string
+	 * @param str
+	 * @return
+	 */
+	static String bchomp(final String str){
+		return chomp(lchomp(str));
+	}
+	
+	/**
+	 * Removes the last newline (if present) of a string. A trailing "\n", "\r" or
+	 * "\r\n" counts as one newline; at most one is removed.
+	 * @param str the text to process, may be null
+	 * @return the text without its final newline; null when {@code str} is null
+	 *         (matching {@code lchomp}, so that {@code bchomp(null)} no longer fails)
+	 */
+	static String chomp(final String str){
+		if (str == null || str.length() == 0) {
+			return str;
+		}
+
+		int end = str.length();
+		final char last = str.charAt(end - 1);
+
+		if (last == '\n') {
+			end--;
+			// treat "\r\n" as a single line break
+			if (end > 0 && str.charAt(end - 1) == '\r') {
+				end--;
+			}
+		} else if (last == '\r') {
+			end--;
+		}
+		return str.substring(0, end);
+	}
+	
+	/**
+	 * Removes the first newline (if present) of a string. A leading "\n", "\r" or
+	 * "\r\n" counts as one newline; at most one is removed.
+	 * <p>
+	 * Previously a leading "\r" (alone or as part of "\r\n") was only removed from
+	 * one-character strings. For backward compatibility the sequence "\n\r" is still
+	 * removed as a single unit, as before.
+	 * @param str the text to process, may be null
+	 * @return the text without its leading newline; null when {@code str} is null
+	 */
+	static String lchomp(final String str){
+		if (str == null || str.length() == 0) {
+			return str;
+		}
+
+		final int len = str.length();
+		final char first = str.charAt(0);
+		int start = 0;
+
+		if (first == '\r') {
+			start = 1;
+			// "\r\n" is one line break
+			if (len > 1 && str.charAt(1) == '\n') {
+				start = 2;
+			}
+		} else if (first == '\n') {
+			start = 1;
+			// legacy behaviour kept: "\n\r" is consumed together
+			if (len > 1 && str.charAt(1) == '\r') {
+				start = 2;
+			}
+		}
+		return str.substring(start);
+	}
+	
+
+    /**
+     * Reads content from the specified URL with specified charset into string
+     * @param url
+     * @param charset
+     * @throws IOException
+     */
+    @Deprecated // Removing network I/O will make htmlcleaner better suited to a server environment which needs managed connections
+    static CharSequence readUrl(URL url, String charset) throws IOException {
+        StringBuilder buffer = new StringBuilder(1024);
+        InputStream inputStream = url.openStream();
+        try {
+            InputStreamReader reader = new InputStreamReader(inputStream, charset);
+            char[] charArray = new char[1024];
+
+            int charsRead = 0;
+            do {
+                charsRead = reader.read(charArray);
+                if (charsRead >= 0) {
+                    buffer.append(charArray, 0, charsRead);
+                }
+            } while (charsRead > 0);
+        } finally {
+            inputStream.close();
+        }
+
+        return buffer;
+    }
+    
+    /**
+     * Checks if specified link is full URL, i.e. starts (after trimming, ignoring case)
+     * with {@code http://}, {@code https://} or {@code file://}.
+     * <p>
+     * The scheme comparison is locale-independent: lower-casing with the default locale
+     * turned "FILE://" into "f\u0131le://" under a Turkish locale and the check failed.
+     *
+     * @param link the link to test, may be null
+     * @return True, if full URL, false otherwise (including for null).
+     */
+    public static boolean isFullUrl(String link) {
+        if (link == null) {
+            return false;
+        }
+        final String normalized = link.trim().toLowerCase(java.util.Locale.ROOT);
+        return normalized.startsWith("http://") || normalized.startsWith("https://") || normalized.startsWith("file://");
+    }
+    
+    /**
+     * Calculates full URL for specified page URL and link
+     * which could be full, absolute or relative like there can
+     * be found in A or IMG tags. (Reinstated as per user request in bug 159)
+     */
+    public static String fullUrl(String pageUrl, String link) {
+        if (isFullUrl(link)) {
+            return link;
+        } else if (link != null && link.startsWith("?")) {
+            int qindex = pageUrl.indexOf('?');
+            int len = pageUrl.length();
+            if (qindex < 0) {
+                return pageUrl + link;
+            } else if (qindex == len - 1) {
+                return pageUrl.substring(0, len - 1) + link;
+            } else {
+                return pageUrl + "&" + link.substring(1);
+            }
+        }
+
+        boolean isLinkAbsolute = link.startsWith("/");
+
+        if (!isFullUrl(pageUrl)) {
+            pageUrl = "http://" + pageUrl;
+        }
+
+        int slashIndex = isLinkAbsolute ? pageUrl.indexOf("/", 8) : pageUrl.lastIndexOf("/");
+        if (slashIndex <= 8) {
+            pageUrl += "/";
+        } else {
+            pageUrl = pageUrl.substring(0, slashIndex + 1);
+        }
+
+        return isLinkAbsolute ? pageUrl + link.substring(1) : pageUrl + link;
+    }
+
+    /**
+     * Escapes a string for HTML output, taking the escaping options from the cleaner
+     * properties. Delegates to the eight-argument {@code escapeXml} with DOM creation
+     * switched off and HTML output switched on.
+     * @param s String to be escaped
+     * @param props Cleaner properties affects escaping behaviour
+     * @return the escaped string
+     */
+    public static String escapeHtml(String s, CleanerProperties props) {
+        return escapeXml(
+                s,
+                props.isAdvancedXmlEscape(),
+                props.isRecognizeUnicodeChars(),
+                props.isTranslateSpecialEntities(),
+                false,
+                props.isTransResCharsToNCR(),
+                props.isTransSpecialEntitiesToNCR(),
+                true);
+    }
+
+    /**
+     * Escapes a string for XML output, taking the escaping options from the cleaner
+     * properties. Delegates to the eight-argument {@code escapeXml} with HTML output
+     * switched off.
+     * @param s String to be escaped
+     * @param props Cleaner properties affects escaping behaviour
+     * @param isDomCreation Tells if escaped content will be part of the DOM
+     * @return the escaped string
+     */
+    public static String escapeXml(String s, CleanerProperties props, boolean isDomCreation) {
+        return escapeXml(
+                s,
+                props.isAdvancedXmlEscape(),
+                props.isRecognizeUnicodeChars(),
+                props.isTranslateSpecialEntities(),
+                isDomCreation,
+                props.isTransResCharsToNCR(),
+                props.isTransSpecialEntitiesToNCR(),
+                false);
+    }
+    
+    /**
+     * Escapes a string for XML (non-HTML) output. Shorthand for the eight-argument
+     * overload with {@code isHtmlOutput} set to false.
+     * @param s the string to escape
+     * @param advanced whether to use Advanced XML Escaping
+     * @param recognizeUnicodeChars whether to recognise and replace Unicode characters
+     * @param translateSpecialEntities whether to translate special entities
+     * @param isDomCreation whether the escaping is in the context of DomCreation, an internal operation, with special rules.
+     * @param transResCharsToNCR forwarded unchanged to the eight-argument overload
+     * @param translateSpecialEntitiesToNCR forwarded unchanged to the eight-argument overload
+     * @return the escaped string
+     * TODO Consider moving to CleanerProperties since a long list of params is misleading.
+     */
+    public static String escapeXml(String s, boolean advanced, boolean recognizeUnicodeChars, boolean translateSpecialEntities, 
+                                   boolean isDomCreation, boolean transResCharsToNCR, boolean translateSpecialEntitiesToNCR) {
+        final boolean htmlOutput = false;
+        return escapeXml(s, advanced, recognizeUnicodeChars, translateSpecialEntities,
+                isDomCreation, transResCharsToNCR, translateSpecialEntitiesToNCR, htmlOutput);
+    }
+    
+    /**
+     * Escapes a string character by character for XML or HTML output.
+     * <p>
+     * Ampersands are examined to see whether they start a numeric reference ("&#...")
+     * or a known special entity; other characters that have a special entity of their
+     * own are replaced by an escaped form; everything else is copied unchanged.
+     * <p>
+     * change notes:
+     * 1) convert ascii characters encoded using &#xx; format to the ascii characters -- may be an attempt to slip in malicious html
+     * 2) convert &#xxx; format characters to &quot; style representation if available for the character.
+     * 3) convert html special entities to xml &#xxx; when outputing in xml
+     * @param s the string to escape
+     * @param advanced whether to use Advanced XML Escaping
+     * @param recognizeUnicodeChars whether to recognise and replace Unicode characters
+     * @param translateSpecialEntities whether to translate special entities
+     * @param isDomCreation whether the escaping is in the context of DomCreation, an internal operation, with special rules.
+     * @param transResCharsToNCR when true, ampersands and special-entity characters are written as decimal numeric character references instead of named escapes
+     * @param translateSpecialEntitiesToNCR passed on to convertToUnicode for numeric references
+     * @param isHtmlOutput whether the output is intended to be treated as HTML
+     * @return the escaped string, or null when {@code s} is null
+     * TODO Consider moving to CleanerProperties since a long list of params is misleading.
+     */
+    public static String escapeXml(String s, boolean advanced, boolean recognizeUnicodeChars, boolean translateSpecialEntities, 
+                                   boolean isDomCreation, boolean transResCharsToNCR, boolean translateSpecialEntitiesToNCR, boolean isHtmlOutput) {
+        if (s != null) {
+    		int len = s.length();
+    		StringBuilder result = new StringBuilder(len);
+
+    		for (int i = 0; i < len; i++) {
+    			char ch = s.charAt(i);
+
+    			SpecialEntity code;
+    			if (ch == '&') {
+    				// "&#": numeric character reference; the helper returns the index to resume from
+    				if ( (advanced || recognizeUnicodeChars) && (i < len-1) && (s.charAt(i+1) == '#') ) {
+
+    					i = convertToUnicode(s, isDomCreation, recognizeUnicodeChars, translateSpecialEntitiesToNCR, result, i+2);
+    				} else if ((translateSpecialEntities || advanced) &&
+				        // look the entity up in a window of at most 10 characters starting at the ampersand
+				        (code = SpecialEntities.INSTANCE.getSpecialEntity(s.substring(i, i+Math.min(10, len-i)))) != null) {
+			            if (translateSpecialEntities && code.isHtmlSpecialEntity()) {
+                            if (recognizeUnicodeChars) {
+                                result.append( (char)code.intValue() );
+                            } else {
+                                result.append( code.getDecimalNCR() );
+                            }
+							i += code.getKey().length() + 1;
+    				    } else if (advanced ) {
+    				    	//
+    				    	// If we are creating a HTML DOM or outputting to the HtmlSerializer, use HTML special entities;
+    				    	// otherwise we get their XML escaped version (see bug #118).
+    				    	//
+					        result.append(transResCharsToNCR ? code.getDecimalNCR() : code.getEscaped(isHtmlOutput || isDomCreation));
+		                    i += code.getKey().length()+1;
+			            } else {
+			                // entity found but not translated: escape only the ampersand, the rest is copied by later iterations
+			                result.append(transResCharsToNCR ? getAmpNcr() : "&amp;");
+			            }
+					} 
+
+			        //
+			        // If the serializer used to output is HTML rather than XML, and we have a match to a
+			        // known HTML entity such as &nbsp;, we output it as-is (see bug #118)
+			        //
+
+					else if (isHtmlOutput)
+					{
+						// we have an ampersand and that's all we know so far
+					
+						code = SpecialEntities.INSTANCE.getSpecialEntity(s.substring(i, i+Math.min(10, len-i)));
+					
+						if ( code != null )
+						{
+							// It is a special entity like &nbsp; - leave it in place.
+					
+							result.append(code.getEscapedValue());
+					
+							// advance i by the length of the entity so we won't process each following character
+							// key length excludes & and ; and we add 1 to skip the ;
+							i += code.getKey().length()+1;
+						}
+						else if ( (i < len-1) && (s.charAt(i+1) == '#') )
+						{
+							// if the next char is a # then convert entity number to entity name (if possible)
+					
+							i = convert_To_Entity_Name(s, false, false, false, result, i+2);
+					
+							// assuming 'i' is being incremented correctly... not verified.
+						}
+						else
+						{
+							// html output but not an entity name or number
+					
+							result.append(transResCharsToNCR ? getAmpNcr() : "&amp;");
+						}
+					} else {
+    				    // plain ampersand: always escaped
+    				    result.append(transResCharsToNCR ? getAmpNcr() : "&amp;");
+    				}
+    			} else if ((code = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(ch)) != null ) {
+
+					// It's a special entity character itself
+					
+					if ( isHtmlOutput )
+					{
+						if ( "apos".equals(code.getKey()) )
+						{
+							// leave the apostrophes alone for html output
+							// this is a cheap hack to avoid removing apostrophe from the special entities list for html output
+							result.append(ch);
+						}
+						else
+						{
+							// output as entity name, or as literal character if isDomCreation
+							result.append(isDomCreation? code.getHtmlString() : code.getEscapedValue());
+						}
+					}
+					else
+					{
+						// if we have one of the XML reserved characters, get escaped version, otherwise, 
+						// output the literal characters.
+				    	if (isDomCreation && !isXmlReservedCharacter(String.valueOf(ch))){
+				    		result.append(ch);
+				    	} else {
+							// output as entity number, or as literal character if isDomCreation
+							result.append(transResCharsToNCR ? code.getDecimalNCR() : code.getEscaped(isDomCreation));
+				    	}
+					}
+
+    			} else {
+    				// ordinary character: copied through unchanged
+    				result.append(ch);
+    			}
+    		}
+
+    		return result.toString();
+    	}
+
+    	// null in, null out
+    	return null;
+    }
+
+    /** Lazily computed decimal numeric character reference for '&'; null until first requested. */
+    private static String ampNcr;
+
+    /**
+     * Returns the decimal numeric character reference of the ampersand, looking it up in
+     * {@code SpecialEntities} on first use and reusing the stored value afterwards.
+     */
+    private static String getAmpNcr() {
+        String cached = ampNcr;
+        if (cached == null) {
+            cached = SpecialEntities.INSTANCE.getSpecialEntityByUnicode('&').getDecimalNCR();
+            ampNcr = cached;
+        }
+        return cached;
+    }
+
+    // Matches one printable character (POSIX class \p{Print}); compiled without Unicode flags.
+    // Used by the numeric-reference converters to decide whether a decoded character is emitted literally.
+    private static final Pattern ASCII_CHAR = Pattern.compile("\\p{Print}");
+
+    /**
+     * Converts a Numeric Character Reference (decimal or hex) to a Character Entity
+     * Reference where possible, i.e. &#8364; to &euro;, and appends the outcome to
+     * {@code result}. This is almost a copy of convertToUnicode and is only called for
+     * HTML output when "&#" is seen in the input.
+     * <p>
+     * Differences from the previous version: an upper-case hex marker ("&#X41;") is
+     * parsed as hexadecimal, and a reference to a number that is not a Unicode code
+     * point (for example "&#99999999;") is emitted with an escaped ampersand instead of
+     * letting the IllegalArgumentException from {@link Character#toChars(int)} escape.
+     *
+     * @param s the text being escaped
+     * @param domCreation whether the output is destined for DOM creation
+     * @param recognizeUnicodeChars whether decoded characters are emitted literally
+     * @param translateSpecialEntitiesToNCR whether non-HTML special entities are written as numeric references
+     * @param result buffer receiving the output
+     * @param i index of the first character after "&#"
+     * @return the index the caller's loop should continue from (see extractCharCode)
+     */
+    private static int convert_To_Entity_Name(String s, boolean domCreation, boolean recognizeUnicodeChars, boolean translateSpecialEntitiesToNCR, StringBuilder result, int i) {
+        StringBuilder unicode = new StringBuilder();
+        int charIndex = extractCharCode(s, i, true, unicode);
+        if (unicode.length() > 0) {
+            try {
+                // The extraction patterns accept both 'x' and 'X' as the hex marker.
+                boolean isHex = unicode.substring(0,1).equalsIgnoreCase("x");
+
+                //
+                // Get the unicode character and code point
+                //
+                int codePoint = isHex
+                        ? Integer.parseInt(unicode.substring(1), 16)
+                        : Integer.parseInt(unicode.toString());
+                // throws IllegalArgumentException when codePoint is not a valid Unicode code point
+                char[] unicodeChar = Character.toChars(codePoint);
+
+                SpecialEntity specialEntity = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(codePoint);
+                if (unicodeChar.length == 1 && unicodeChar[0] == 0) {
+                    // null character &#0Peanut for example
+                    // just consume character &
+                    result.append("&amp;");
+                }
+                else if ( specialEntity != null )
+                {
+                    if ( specialEntity.isHtmlSpecialEntity() )
+                    {
+                        result.append( domCreation? specialEntity.getHtmlString() : specialEntity.getEscapedValue() );
+                    }
+                    else
+                    {
+                        result.append(domCreation? specialEntity.getHtmlString():
+                        (translateSpecialEntitiesToNCR? (isHex? specialEntity.getHexNCR(): specialEntity.getDecimalNCR()) :
+                        specialEntity.getHtmlString()));
+                    }
+                } else if ( recognizeUnicodeChars ) {
+                    // output unicode characters as their actual byte code with the exception of characters that have special xml meaning.
+                    result.append( String.valueOf(unicodeChar));
+                } else if ( ASCII_CHAR.matcher(new String(unicodeChar)).find()) {
+                    // ascii printable character. this fancy escaping might be an attempt to slip in dangerous characters (i.e. spelling out <script> )
+                    // by converting to printable characters we can more easily detect such attacks.
+                    result.append(String.valueOf(unicodeChar));
+                } else {
+                    // unknown unicode value - output as-is
+                    result.append( "&#").append(unicode).append(";" );
+                }
+            } catch (NumberFormatException e) {
+                // digits that do not fit an int, or a marker that is not a number
+                result.append("&amp;#").append(unicode).append(";" );
+            } catch (IllegalArgumentException e) {
+                // code point is not a legal unicode character
+                result.append("&amp;#").append(unicode).append(";" );
+            }
+        } else {
+            // nothing numeric follows "&#": only the ampersand is written
+            result.append("&amp;");
+        }
+        return charIndex;
+    }
+
+
+    /**
+     * Decodes a numeric character reference (the text following "&#") and appends its
+     * representation for XML output to {@code result}.
+     * <p>
+     * An upper-case hex marker ("&#X41;") is now parsed as hexadecimal; previously it was
+     * accepted by the extraction pattern but then parsed as decimal, which always failed.
+     *
+     * @param s the text being escaped
+     * @param domCreation whether the output is destined for DOM creation
+     * @param recognizeUnicodeChars whether decoded characters are emitted literally
+     * @param translateSpecialEntitiesToNCR whether special entities are written as numeric references
+     * @param result buffer receiving the output
+     * @param i index of the first character after "&#"
+     * @return the index the caller's loop should continue from (see extractCharCode)
+     */
+    private static int convertToUnicode(String s, boolean domCreation, boolean recognizeUnicodeChars, boolean translateSpecialEntitiesToNCR, StringBuilder result, int i) {
+        StringBuilder unicode = new StringBuilder();
+        int charIndex = extractCharCode(s, i, true, unicode);
+        if (unicode.length() > 0) {
+            try {
+                // The extraction patterns accept both 'x' and 'X' as the hex marker.
+                boolean isHex = unicode.substring(0,1).equalsIgnoreCase("x");
+
+                //
+                // Get the unicode character and code point
+                //
+                int codePoint = -1;
+                char[] unicodeChar = null;
+                if (isHex){
+                    codePoint = Integer.parseInt(unicode.substring(1), 16);
+                } else {
+                    codePoint = Integer.parseInt(unicode.toString());
+                }
+
+                unicodeChar = Character.toChars(codePoint);
+
+                SpecialEntity specialEntity = SpecialEntities.INSTANCE.getSpecialEntityByUnicode(codePoint);
+                if (unicodeChar.length == 1 && unicodeChar[0] == 0) {
+                    // null character &#0Peanut for example
+                    // just consume character &
+                    result.append("&amp;");
+                } else if ( specialEntity != null &&
+                        // special characters that are always escaped.
+                        (!specialEntity.isHtmlSpecialEntity()
+                                // OR we are not outputting unicode characters as the characters ( they are staying escaped )
+                                || !recognizeUnicodeChars)) {
+                    result.append(domCreation? specialEntity.getHtmlString():
+                        (translateSpecialEntitiesToNCR? (isHex? specialEntity.getHexNCR(): specialEntity.getDecimalNCR()) :
+                            specialEntity.getEscapedXmlString()));
+                } else if ( recognizeUnicodeChars ) {
+                    // output unicode characters as their actual byte code with the exception of characters that have special xml meaning.
+                    result.append( String.valueOf(unicodeChar));
+                } else if ( ASCII_CHAR.matcher(new String(unicodeChar)).find()) {
+                    // ascii printable character. this fancy escaping might be an attempt to slip in dangerous characters (i.e. spelling out <script> )
+                    // by converting to printable characters we can more easily detect such attacks.
+                    result.append(String.valueOf(unicodeChar));
+                } else {
+                    result.append( "&#").append(unicode).append(";" );
+                }
+            } catch (NumberFormatException e) {
+                // digits that do not fit an int, or a marker that is not a number
+                result.append("&amp;#").append(unicode).append(";" );
+            }
+            catch (IllegalArgumentException e) {
+                // code point is not a legal unicode character
+                result.append("&amp;#").append(unicode).append(";" );
+            }
+        } else {
+            // NOTE(review): nothing numeric follows "&#"; only "&amp;" is written and the returned
+            // index equals i, so the caller's loop increment appears to skip the '#' and the
+            // character at i - confirm this is intended.
+            result.append("&amp;");
+        }
+        return charIndex;
+    }
+
+    // Patterns applied to the text right after "&#" by extractCharCode. Group 1 is the
+    // number (including the hex marker for the hex patterns); group 2 is the optional ';'.
+    // HEX_RELAXED additionally skips leading zeros before the hex marker.
+    // NOTE(review): inside a character class '|' is a literal, so "[x|X]" also accepts a '|'
+    // as the marker - presumably "[xX]" was meant; confirm before changing.
+    // TODO have pattern consume leading 0's and discard.
+    public static Pattern HEX_STRICT = Pattern.compile("^([x|X][\\p{XDigit}]+)(;?)");
+    public static Pattern HEX_RELAXED = Pattern.compile("^0*([x|X][\\p{XDigit}]+)(;?)");
+    public static Pattern DECIMAL = Pattern.compile("^([\\p{Digit}]+)(;?)");
+    /**
+     * <ul>
+     * <li>(earlier code was failing on this) - &#138A; is converted by FF to 3 characters: &#138; + 'A' + ';'</li>
+     * <li>&#0x138A; is converted by FF to 6? 7? characters: &#0 'x'+'1'+'3'+ '8' + 'A' + ';'
+     * #0 is displayed kind of weird</li>
+     * <li>&#x138A; is a single character</li>
+     * </ul>
+     *
+     * @param s
+     * @param charIndex
+     * @param relaxedUnicode '&#0x138;' is treated like '&#x138;'
+     * @param unicode
+     * @return the index to continue scanning the source string -1 so normal loop incrementing skips the ';'
+     */
+    private static int extractCharCode(String s, int charIndex, boolean relaxedUnicode, StringBuilder unicode) {
+        int len = s.length();
+        CharSequence subSequence = s.subSequence(charIndex, Math.min(len,charIndex+15));
+        Matcher matcher;
+        if( relaxedUnicode ) {
+            matcher = HEX_RELAXED.matcher(subSequence);
+        } else {
+            matcher = HEX_STRICT.matcher(subSequence);
+        }
+        // silly note: remember calling find() twice finds second match :-)
+        if (matcher.find() || ((matcher = DECIMAL.matcher(subSequence)).find())) {
+            // -1 so normal loop incrementing skips the ';'
+            charIndex += matcher.end() -1;
+            unicode.append(matcher.group(1));
+        }
+        return charIndex;
+    }
+    
+    /**
+     * Sanitizes an identifier using the prefix "hc-generated-" and an empty replacement.
+     * @param attName the attribute name to fix
+     * @return the result of the three-argument overload
+     */
+    public static String sanitizeXmlIdentifier(String attName){
+        final String defaultPrefix = "hc-generated-";
+        return sanitizeXmlIdentifier(attName, defaultPrefix, "");
+    }
+    
+    /**
+     * Sanitizes an identifier using the given prefix and an empty replacement.
+     * @param attName the attribute name to fix
+     * @param prefix the prefix to use to indicate an attribute name has been altered
+     * @return the result of the three-argument overload
+     */
+    public static String sanitizeXmlIdentifier(String attName, String prefix){
+        final String noReplacement = "";
+        return sanitizeXmlIdentifier(attName, prefix, noReplacement);
+    }
+
+    /**
+     * Deletes from an attribute name every NUL, space, double quote, apostrophe,
+     * '&gt;', '/' and '=' character.
+     * @param name the attribute name to clean
+     * @return the name without those characters
+     */
+    public static String sanitizeHtmlAttributeName(String name){
+        // Attribute names must consist of one or more characters other than controls,
+        // U+0020 SPACE, U+0022 ("), U+0027 ('), U+003E (>), U+002F (/), U+003D (=), and noncharacters.
+        final Pattern forbidden = compileUnicodePattern("[\\u0000\\u0020\\u0022\\u0027\\u003E\\u002F\\u003d]");
+        return forbidden.matcher(name).replaceAll("");
+    }
+    
+    /**
+     * Tells whether a name is non-empty and free of NUL, space, double quote,
+     * apostrophe, '&gt;', '/' and '=' characters.
+     * @param name the attribute name to check
+     * @return true when the name matches
+     */
+    public static boolean isValidHtmlAttributeName(String name){
+        final Pattern allowed = compileUnicodePattern("^[^\\u0000\\u0020\\u0022\\u0027\\u003E\\u002F\\u003d]+$");
+        return allowed.matcher(name).find();
+    }
+    
+    /**
+     * Attempts to replace invalid attribute names with valid ones.
+     * <p>
+     * Strategy: a valid name is returned unchanged. If only the first character is at
+     * fault, the name is prefixed (or, with an empty prefix, the first character is
+     * dropped). Otherwise every invalid character is replaced by
+     * {@code replacementCharacter}. If none of this yields a valid identifier, null is returned.
+     * <p>
+     * Fixes over the previous version: {@code replacementCharacter} is now honoured (it
+     * was ignored and invalid characters were always deleted), and a null or empty
+     * {@code attName} returns null instead of throwing.
+     *
+     * @param attName the attribute name to fix
+     * @param prefix the prefix to use to indicate an attribute name has been altered; null is treated as empty
+     * @param replacementCharacter text substituted for each invalid character; null or "" deletes them
+     * @return either the original attribute name if valid, a generated identifier if not,
+     *         or null when no valid identifier could be produced
+     */
+    public static String sanitizeXmlIdentifier(String attName, String prefix, String replacementCharacter){
+        if (Utils.isValidXmlIdentifier(attName)) return attName;
+
+        // nothing to repair in a null or empty name
+        if (attName == null || attName.isEmpty()) return null;
+
+        //
+        // Prepend with "hc-generated-" or similar prefix. Useful for
+        // identifiers that are valid apart from the start character, e.g "1a"
+        //
+        if (!Utils.isValidXmlIdentifierStartChar(attName.substring(0,1))){
+            if (prefix != null && !prefix.isEmpty()){
+                String generatedAttName = prefix + attName;
+                if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;
+            } else {
+                //
+                // If not, strip out first character
+                //
+                String generatedAttName = attName.substring(1);
+                if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;
+            }
+        }
+
+        //
+        // otherwise, replace or strip out invalid characters
+        //
+        final String replacement = (replacementCharacter == null) ? "" : replacementCharacter;
+        String generatedAttName = Utils.replaceInvalidXmlIdentifierCharacters(attName, replacement);
+        if (Utils.isValidXmlIdentifier(generatedAttName)) return generatedAttName;
+
+        //
+        // If we still have something invalid - for example none of the characters in
+        // it are valid - then return null
+        //
+        return null;
+    }
+
+    /**
+     * Tells whether a string could serve as a tag or attribute name in XML, by matching
+     * it against {@code VALID_XML_IDENTIFIER_CHAR_PATTERN}.
+     * @param s String to be checked
+     * @return True if string is valid xml identifier, false otherwise (including for null)
+     */
+    public static boolean isValidXmlIdentifier(String s) {
+        return s != null && VALID_XML_IDENTIFIER_CHAR_PATTERN.matcher(s).find();
+    }
+
+    /**
+     * Tells whether an object is null or its text is blank. The text is first run
+     * through {@code escapeXml} with advanced escaping, then non-breaking spaces are
+     * turned into ordinary spaces before trimming.
+     * @param o the object whose string form is examined
+     * @return True if specified object is null or contains only whitespace characters
+     */
+    public static boolean isEmptyString(Object o) {
+        if (o == null) {
+            return true;
+        }
+        final String escaped = escapeXml(o.toString(), true, false, false, false, false, false, false);
+        // TODO: doesn't escapeXml handle this?
+        final String trimmed = escaped.replace(SpecialEntities.NON_BREAKABLE_SPACE, ' ').trim();
+        return trimmed.length() == 0;
+    }
+
+    /**
+     * Splits a string into tokens with {@link StringTokenizer}, so empty tokens are never produced.
+     * @param s the text to split; null gives an empty array
+     * @param delimiters the set of delimiter characters
+     * @return the tokens in order of appearance
+     */
+    public static String[] tokenize(String s, String delimiters) {
+        if (s == null) {
+            return new String[0];
+        }
+
+        final StringTokenizer source = new StringTokenizer(s, delimiters);
+        final String[] tokens = new String[source.countTokens()];
+        for (int k = 0; k < tokens.length; k++) {
+            tokens[k] = source.nextToken();
+        }
+        return tokens;
+    }
+    
+    /**
+     * Tells whether the given text occurs inside the sequence of XML reserved
+     * characters {@code ' " < > &}.
+     * @param c the text to look for, normally a single character
+     * @return true when {@code c} is a substring of that sequence
+     */
+    public static boolean isXmlReservedCharacter(String c){
+        return "'\"<>&".indexOf(c) >= 0;
+    }
+
+    /**
+     * Extracts the namespace prefix of an XML element or attribute name.
+     * @param name the qualified name
+     * @return the part before the first ':' when that colon is not the first character; null otherwise
+     */
+    public static String getXmlNSPrefix(String name) {
+        final int colon = name.indexOf(':');
+        return (colon > 0) ? name.substring(0, colon) : null;
+    }
+
+    /**
+     * Extracts the local part of an XML element or attribute name.
+     * @param name the qualified name
+     * @return the part after the first ':' when the colon is neither the first nor the
+     *         last character; otherwise the name unchanged
+     */
+    public static String getXmlName(String name) {
+        final int colon = name.indexOf(':');
+        final boolean hasBothParts = colon > 0 && colon < name.length() - 1;
+        return hasBothParts ? name.substring(colon + 1) : name;
+    }
+    
+    /**
+     * Tells whether {@link Integer#parseInt(String, int)} accepts the given text.
+     * @param s the text to parse
+     * @param radix the radix to parse with
+     * @return true when parsing succeeds, false when it raises NumberFormatException
+     */
+    static boolean isValidInt(String s, int radix) {
+        boolean parsable = true;
+        try {
+            Integer.parseInt(s, radix);
+        } catch (NumberFormatException notAnInt) {
+            parsable = false;
+        }
+        return parsable;
+    }
+    
+    /**
+     * Removes leading whitespace, as defined by {@link Character#isWhitespace(char)}.
+     * @param s the text to trim, may be null
+     * @return the text from its first non-whitespace character on; "" when there is none; null for null
+     */
+    public static String ltrim(String s) {
+        if (s == null) {
+            return null;
+        }
+
+        final int len = s.length();
+        for (int pos = 0; pos < len; pos++) {
+            if (!Character.isWhitespace(s.charAt(pos))) {
+                return s.substring(pos);
+            }
+        }
+        return "";
+    }
+
+    /**
+     * Removes trailing whitespace, as defined by {@link Character#isWhitespace(char)}.
+     * @param s the text to trim, may be null
+     * @return the text up to its last non-whitespace character; "" when there is none; null for null
+     */
+    public static String rtrim(String s) {
+        if (s == null) {
+            return null;
+        }
+
+        for (int end = s.length(); end > 0; end--) {
+            if (!Character.isWhitespace(s.charAt(end - 1))) {
+                return s.substring(0, end);
+            }
+        }
+        return "";
+    }
+    
+    /**
+     * Tells whether an object's string form is empty once trimmed.
+     * @param object Object whose string representation is checked
+     * @return true when the object is non-null, its {@code toString()} is non-null and
+     *         trimming it leaves nothing; false otherwise
+     */
+    public static boolean isWhitespaceString(Object object) {
+        if (object == null) {
+            return false;
+        }
+        final String text = object.toString();
+        return text != null && text.trim().length() == 0;
+    }
+    
+    //
+    // Replaces entities with actual characters
+    //
+    public static String deserializeEntities(String str, boolean recognizeUnicodeChars) {
+    	StringBuffer buf = new StringBuffer(str);
+        SpecialEntities entities = SpecialEntities.INSTANCE;
+        int entityStart = -1;
+        boolean numericEntity = false;
+        boolean hexEntity = false;
+        int maxEntityLength = entities.getMaxEntityLength();
+        int i = 0;
+        int length = buf.length();
+        while (i < length) {
+            if (buf.charAt(i) == '&') {
+                entityStart = i;
+                numericEntity = false;
+                hexEntity = false;
+                ++i;
+            } else if (entityStart != -1) {
+                if (buf.charAt(i) == ';') {
+                    int entityValue = -1;
+                    if (numericEntity) {
+                        try {
+                            entityValue = Integer.parseInt(
+                            		buf.substring(
+                                            entityStart + (hexEntity ? 3 : 2),
+                                            i
+                                    ),
+                                    hexEntity ? 16 : 10
+                            );
+                        } catch (NumberFormatException e) {
+                            entityValue = -1;
+                        }
+                        
+                    	SpecialEntity entity = entities.getSpecialEntityByUnicode(entityValue);
+                    	if(entity != null)
+                    		entityValue = entity.intValue();
+                    	else if(!recognizeUnicodeChars)
+                    		entityValue = -1;
+                    } else {
+                    	SpecialEntity entity = entities.getSpecialEntity(buf.substring(entityStart + 1, i));
+                    	if(entity != null)
+                    		entityValue = entity.intValue();
+                    }
+                    
+                    if (entityValue >= 0) {
+                        char[] decodedEntity = Character.toChars(entityValue);
+                        buf.replace(entityStart, i + 1, new String(decodedEntity));
+                        length = buf.length();
+                        i = entityStart + decodedEntity.length;
+                    } else {
+                        ++i;
+                    }
+                    entityStart = -1;
+                } else {
+                    if (i == entityStart + 1 && buf.charAt(i) == '#') {
+                        numericEntity = true;
+                    } else if (i == entityStart + 2 && numericEntity && buf.charAt(i) == 'x') {
+                        hexEntity = true;
+                    } else if (i - entityStart > maxEntityLength) {
+                        entityStart = -1;
+                    }
+                    ++i;
+                }
+            } else {
+                ++i;
+            }
+        }
+        return buf.toString();
+    }
+    
+    /**
+     * Tells whether an identifier begins with a character that may start an XML name,
+     * by matching it against {@code VALID_XML_IDENTIFIER_START_CHAR_PATTERN}.
+     * @param identifier the identifier to check
+     * @return true if the initial character is valid
+     */
+    public static boolean isValidXmlIdentifierStartChar(String identifier){
+        return VALID_XML_IDENTIFIER_START_CHAR_PATTERN.matcher(identifier).find();
+    }
+
+    /**
+     * Matches any single character that is not allowed inside an XML name.
+     * The range U+00C0-U+00D6 is written with its hyphen; without it the letters
+     * between the two end points were treated as invalid and removed.
+     */
+    private static final Pattern INVALID_XML_IDENTIFIER_CHAR_PATTERN = compileUnicodePattern(""
+            + "[^:A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6"
+            + "\\u00F8-\\u02ff\\u0370-\\u037d\\u037f-\\u1fff\\u200c\\u200d\\u2070-\\u218f"
+            + "\\u2c00-\\u2fef\\u3001-\\udfff\\uf900-\\ufdcf\\ufdf0-\\ufffd\\-\\.0-9"
+            + "\\u00b7\\u0300-\\u036f\\u203f-\\u2040]");
+
+    /**
+     * Strips out invalid characters from names used for XML Elements and replaces them with the specified
+     * character.
+     * 
+     * For example, with replacement "_", {@code p%} becomes {@code p_}.
+     * The start-character rule is not checked here, so the result is not guaranteed to
+     * be a complete valid XML name.
+     * @param name the name to clean
+     * @param replacement text substituted for every invalid character ("" removes them);
+     *        interpreted as a {@link Matcher#replaceAll(String)} replacement string
+     * @return the name with every invalid character replaced
+     */
+    public static String replaceInvalidXmlIdentifierCharacters(String name, String replacement){
+        // the pattern is compiled once instead of on every call
+        final Matcher matcher = INVALID_XML_IDENTIFIER_CHAR_PATTERN.matcher(name);
+        return matcher.replaceAll(replacement);
+    }
+    
+    
+    /**
+     * Compiles a regular expression with {@link Pattern#UNICODE_CHARACTER_CLASS}; if the
+     * compile call rejects its arguments with IllegalArgumentException, the expression
+     * is compiled again without any flags.
+     * @param pattern the regular expression source
+     * @return the compiled pattern
+     */
+    private static Pattern compileUnicodePattern(String pattern){
+        Pattern compiled;
+        try {
+            compiled = Pattern.compile(pattern, Pattern.UNICODE_CHARACTER_CLASS);
+        } catch(IllegalArgumentException flagRejected) {
+            compiled = Pattern.compile(pattern);
+        }
+        return compiled;
+    }
+
+
+    
+}
@@ -0,0 +1,612 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.StringTokenizer;
+
+/**
+ * <p>Utility for searching cleaned document tree with XPath expressions.</p>
+ * Examples of supported axes:
+ * <code>
+ * <ul>
+ *      <li>//div//a</li>
+ *      <li>//div//a[@id][@class]</li>
+ *      <li>/body/*[1]/@type</li>
+ *      <li>//div[3]//a[@id][@href='r/n4']</li>
+ *      <li>(//div[last() >= 4]//./div[position() = last()])[position() > 22]//li[2]//a</li>
+ *      <li>//div[2]/@*[2]</li>
+ *      <li>data(//div//a[@id][@class])</li>
+ *      <li>//p/last()</li>
+ *      <li>//body//div[3][@class]//span[12.2<position()]/@id</li>
+ *      <li>data(//a['v' < @id])</li>
+ * </ul>
+ * </code>
+ */
+public class XPather {
+
+	// Character codes used by isValidInteger / isValidDouble when scanning numeric tokens.
+	private static final int C0 = '0'; // lowest decimal digit
+	private static final int C9 = '9'; // highest decimal digit
+	private static final int CD = '.'; // decimal point
+	private static final int CP = '+'; // plus sign
+	private static final int CM = '-'; // minus sign
+	private static final int CS = ' '; // space, accepted by isValidDouble as a first character
+	
+    // Basic tokens of which the XPath expression is made, in order of appearance;
+    // assigned by the constructor.
+    private String tokenArray[];
+
+    /**
+     * Constructor - creates XPather instance with specified XPath expression.
+     * @param expression
+     */
+    public XPather(String expression) {
+        StringTokenizer tokenizer = new StringTokenizer(expression, "/()[]\"'=<>", true);
+        int tokenCount = tokenizer.countTokens();
+        tokenArray = new String[tokenCount];
+
+        int index = 0;
+
+        // this is not real XPath compiler, rather simple way to recognize basic XPaths expressions
+        // and interpret them against some TagNode instance.
+        while (tokenizer.hasMoreTokens()) {
+            tokenArray[index++] = tokenizer.nextToken();
+        }
+    }
+
+    /**
+     * Main public method for this class - executes the XPath expression against the
+     * specified TagNode instance.
+     * @param node node the expression is evaluated against
+     * @return the evaluation results copied into a new array, in iteration order
+     * @throws XPatherException if node is null or the expression cannot be evaluated
+     */
+    public Object[] evaluateAgainstNode(TagNode node) throws XPatherException {
+        if (node == null) {
+            throw new XPatherException("Cannot evaluate XPath expression against null value!");
+        }
+
+        final int lastTokenIndex = tokenArray.length - 1;
+        final Collection matches = evaluateAgainst(singleton(node), 0, lastTokenIndex, false, 1, 0, false, null);
+        return matches.toArray();
+    }
+
+    /**
+     * Always throws an XPatherException created with its no-argument constructor.
+     * @throws XPatherException unconditionally
+     */
+    private void throwStandardException() throws XPatherException {
+        throw new XPatherException();
+    }
+
+    /**
+     * Recursive evaluator: interprets the tokens in the inclusive range [from, to] against
+     * the given collection, dispatching on the kind of the token at index {@code from}.
+     *
+     * @param object collection the current token range is applied to
+     * @param from index of the first token to interpret
+     * @param to index of the last token to interpret
+     * @param isRecursive passed on to getElementsByName when the token is a name
+     * @param position position value handed to function evaluation
+     * @param last last value handed to function evaluation
+     * @param isFilterContext true while evaluating inside a filter; only then are the
+     *        comparison operators interpreted
+     * @param filterSource collection against which the right-hand operand of a comparison
+     *        is evaluated
+     * @return the result of the evaluation; {@code object} itself when the token range is
+     *         empty or out of bounds
+     * @throws XPatherException if the expression cannot be interpreted
+     */
+    protected Collection evaluateAgainst(Collection object,
+                                       int from,
+                                       int to,
+                                       boolean isRecursive,
+                                       int position,
+                                       int last,
+                                       boolean isFilterContext,
+                                       Collection filterSource) throws XPatherException {
+        if (from >= 0 && to < tokenArray.length && from <= to) {
+            if ("".equals(tokenArray[from].trim())) {
+                // blank token - skip it
+                return evaluateAgainst(object, from + 1, to, isRecursive, position, last, isFilterContext, filterSource);
+            } else if (isToken("(", from)) {
+                // bracketed sub-expression: evaluate the inside, then continue after the closing bracket
+                int closingBracket = findClosingIndex(from, to);
+                if (closingBracket > 0) {
+                    Collection value = evaluateAgainst(object, from + 1, closingBracket - 1, false, position, last, isFilterContext, filterSource);
+                    return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext, filterSource);
+                } else {
+                    throwStandardException();
+                }
+            } else if (isToken("[", from)) {
+                // filter: keep the elements satisfying the condition between the brackets
+                int closingBracket = findClosingIndex(from, to);
+                if (closingBracket > 0 && object != null) {
+                    Collection value = filterByCondition(object, from + 1, closingBracket - 1);
+                    return evaluateAgainst(value, closingBracket + 1, to, false, position, last, isFilterContext, filterSource);
+                } else {
+                    throwStandardException();
+                }
+            } else if (isToken("\"", from) || isToken("'", from)) { // string constant
+                int closingQuote = findClosingIndex(from, to);
+                if (closingQuote > from) {
+                    Collection value = singleton( flatten(from + 1, closingQuote - 1) );
+                    return evaluateAgainst(value, closingQuote + 1, to, false, position, last, isFilterContext, filterSource);
+                } else {
+                    throwStandardException();
+                }
+            } else if ( (isToken("=", from) || isToken("<", from) || isToken(">", from)) && isFilterContext ) {     // operator inside filter
+                boolean logicValue;
+                if ( isToken("=", from + 1) && (isToken("<", from) || isToken(">", from)) ) {
+                    // two-token operator: "<=" or ">="
+                    Collection secondObject = evaluateAgainst(filterSource, from + 2, to, false, position, last, isFilterContext, filterSource);
+                    logicValue = evaluateLogic(object, secondObject, tokenArray[from] + tokenArray[from + 1]);
+                } else {
+                    Collection secondObject = evaluateAgainst(filterSource, from + 1, to, false, position, last, isFilterContext, filterSource);
+                    logicValue = evaluateLogic(object, secondObject, tokenArray[from]);
+                }
+                // Boolean.valueOf instead of the deprecated Boolean constructor
+                return singleton(Boolean.valueOf(logicValue));
+            } else if (isToken("/", from)) {    // children of the node
+                // a second consecutive slash switches the following step to recursive lookup
+                boolean goRecursive = isToken("/", from + 1);
+                if (goRecursive) {
+                    from++;
+                }
+                if ( from < to ) {
+                    // the step ends just before the next top-level slash, or at "to" if there is none
+                    int toIndex = findClosingIndex(from, to) - 1;
+                    if (toIndex <= from) {
+                        toIndex = to;
+                    }
+                    Collection value = evaluateAgainst(object, from + 1, toIndex, goRecursive, 1, last, isFilterContext, filterSource);
+                    return evaluateAgainst(value, toIndex + 1, to, false, 1, last, isFilterContext, filterSource);
+                } else {
+                    throwStandardException();
+                }
+            } else if (isFunctionCall(from, to)) {
+                int closingBracketIndex = findClosingIndex(from + 1, to);
+                Collection funcValue = evaluateFunction(object, from, to, position, last, isFilterContext);
+                return evaluateAgainst(funcValue, closingBracketIndex + 1, to, false, 1, last, isFilterContext, filterSource);
+            } else if (isValidInteger(tokenArray[from])) {
+                Collection value = singleton(Integer.valueOf(tokenArray[from]));
+                return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource);
+            } else if (isValidDouble(tokenArray[from])) {
+                Collection value = singleton(Double.valueOf(tokenArray[from]));
+                return evaluateAgainst(value, from + 1, to, false, position, last, isFilterContext, filterSource);
+            } else {
+                // anything else is treated as an element or attribute name
+                return getElementsByName(object, from, to, isRecursive, isFilterContext);
+            }
+        } else {
+           return object;
+        }
+
+        // Only the branches that called throwStandardException() fall through to here;
+        // the statement is needed so that every path returns or throws.
+        throw new XPatherException();
+    }
+
+    private String flatten(int from, int to) {
+        if (from <= to) {
+            StringBuffer result = new StringBuffer();
+            for (int i = from; i <= to; i++) {
+                result.append(tokenArray[i]);
+            }
+
+            return result.toString();
+        }
+
+        return "";
+    }
+
+	/**
+	 * Checks whether the value has the shape of a decimal integer: an optional leading
+	 * '+' or '-' followed by one or more digits. A lone sign is rejected, because the
+	 * caller converts accepted values with Integer.valueOf, which would throw on it.
+	 * The numeric range is not checked.
+	 * @param value text to check; must not be null
+	 * @return true if the value consists of an optional sign and at least one digit
+	 */
+	private static boolean isValidInteger(String value) {
+	    final int l = value.length();
+	    if (l == 0) {
+	        return false;
+	    }
+
+	    final int first = value.charAt(0);
+	    final boolean signed = first == CP || first == CM;
+	    if (signed) {
+	        // a sign must be followed by at least one digit
+	        if (l == 1) {
+	            return false;
+	        }
+	    } else if (first < C0 || first > C9) {
+	        return false;
+	    }
+
+	    for (int i = 1; i < l; i++) {
+	        final int c = value.charAt(i);
+	        if (c < C0 || c > C9) {
+	            return false;
+	        }
+	    }
+	    return true;
+	}
+
+	/**
+	 * Checks whether the value has the shape of a decimal number: a first character that
+	 * is '+', '-', a space or a digit, followed only by digits and at most one '.'.
+	 * At least one digit is required, so that the caller's Double.valueOf conversion
+	 * does not fail on inputs such as "-" or "1.2.3".
+	 * @param value text to check; must not be null
+	 * @return true if the value can be handed to Double.valueOf
+	 */
+	private boolean isValidDouble(String value) {
+	    final int l = value.length();
+	    if (l == 0) {
+	        return false;
+	    }
+
+	    int c = value.charAt(0);
+	    final boolean firstIsDigit = c >= C0 && c <= C9;
+	    if (c != CP && c != CM && c != CS && !firstIsDigit) {
+	        return false;
+	    }
+
+	    int digits = firstIsDigit ? 1 : 0;
+	    int dots = 0;
+	    for (int i = 1; i < l; i++) {
+	        c = value.charAt(i);
+	        if (c == CD) {
+	            // more than one decimal point cannot be parsed
+	            if (++dots > 1) {
+	                return false;
+	            }
+	        } else if (c >= C0 && c <= C9) {
+	            digits++;
+	        } else {
+	            return false;
+	        }
+	    }
+	    return digits > 0;
+	}
+
+    /**
+     * Checks if given string is valid identifier: after trimming it must start with a
+     * letter and continue only with letters, digits, '_' or '-'.
+     * @param s text to check, may be null
+     * @return true if s is a non-empty valid identifier, false otherwise
+     */
+    private boolean isIdentifier(String s) {
+        if (s == null) {
+            return false;
+        }
+
+        s = s.trim();
+        if (s.length() == 0) {
+            return false;
+        }
+        if ( !Character.isLetter(s.charAt(0)) ) {
+            return false;
+        }
+        for (int i = 1; i < s.length(); i++) {
+            final char ch = s.charAt(i);
+            if ( ch != '_' && ch != '-' && !Character.isLetterOrDigit(ch) ) {
+                return false;
+            }
+        }
+
+        // every character passed the checks
+        return true;
+    }
+
+    /**
+     * Checks if tokens in specified range represents valid function call: an identifier,
+     * immediately followed by "(", with a matching closing bracket inside the range.
+     * @param from index of the token holding the function name
+     * @param to index of the last token of the range
+     * @return True if it is valid function call, false otherwise.
+     */
+    private boolean isFunctionCall(int from, int to) {
+        // both conditions are required: a name AND an opening bracket right after it
+        if ( !isIdentifier(tokenArray[from]) || !isToken("(", from + 1) ) {
+            return false;
+        }
+
+        return findClosingIndex(from + 1, to) > from + 1;
+    }
+
+    /**
+     * Evaluates the function named by the token at index {@code from}, once for every
+     * element of the source collection.
+     * Currently, following XPath functions are supported: last, position, text, count, data
+     * @param source collection the function is applied to
+     * @param from index of the token holding the function name
+     * @param to index of the last token belonging to the call
+     * @param position value reported by position() inside a filter
+     * @param last value reported by last() inside a filter
+     * @param isFilterContext true when the call is evaluated inside a filter
+     * @return Collection as the result of evaluation.
+     * @throws XPatherException if the function name is not supported or an argument cannot be evaluated
+     */
+    protected Collection evaluateFunction(Collection source,
+                                        int from,
+                                        int to,
+                                        int position,
+                                        int last,
+                                        boolean isFilterContext) throws XPatherException {
+        final String functionName = tokenArray[from].trim();
+        final ArrayList collected = new ArrayList();
+        final int sourceSize = source.size();
+
+        int ordinal = 0;
+        for (Object current : source) {
+            ordinal++;
+            if ( "last".equals(functionName) ) {
+                final int value = isFilterContext ? last : sourceSize;
+                collected.add( Integer.valueOf(value) );
+            } else if ( "position".equals(functionName) ) {
+                final int value = isFilterContext ? position : ordinal;
+                collected.add( Integer.valueOf(value) );
+            } else if ( "text".equals(functionName) ) {
+                addTextOf(current, collected);
+            } else if ( "count".equals(functionName) ) {
+                final Collection argument =
+                        evaluateAgainst(source, from + 2, to - 1, false, position, 0, isFilterContext, null);
+                collected.add( Integer.valueOf(argument.size()) );
+            } else if ( "data".equals(functionName) ) {
+                final Collection argument =
+                        evaluateAgainst(source, from + 2, to - 1, false, position, 0, isFilterContext, null);
+                for (Object item : argument) {
+                    addTextOf(item, collected);
+                }
+            } else {
+                throw new XPatherException("Unknown function " + functionName + "!");
+            }
+        }
+
+        return collected;
+    }
+
+    /**
+     * Appends the text of a TagNode, or the value of a String, to the target collection;
+     * objects of any other type are ignored.
+     */
+    private void addTextOf(Object item, Collection target) {
+        if (item instanceof TagNode) {
+            target.add( ((TagNode)item).getText() );
+        } else if (item instanceof String) {
+            target.add( item.toString() );
+        }
+    }
+
+    /**
+     * Filter nodes satisfying the condition
+     * @param source
+     * @param from
+     * @param to
+     */
+    protected Collection filterByCondition(Collection source, int from, int to) throws XPatherException {
+        ArrayList result = new ArrayList();
+        Iterator iterator = source.iterator();
+        int index = 0;
+        int size = source.size();
+        while (iterator.hasNext()) {
+            Object curr = iterator.next();
+            index++;
+
+            ArrayList logicValueList = new ArrayList(evaluateAgainst(singleton(curr), from, to, false, index, size, true, singleton(curr)));
+            if (logicValueList.size() >= 1) {
+                Object first = logicValueList.get(0);
+                if (first instanceof Boolean) {
+                    if ( ((Boolean)first).booleanValue() ) {
+                        result.add(curr);
+                    }
+                } else if (first instanceof Integer) {
+                    if ( ((Integer)first).intValue() == index ) {
+                        result.add(curr);
+                    }
+                } else {
+                    result.add(curr);
+                }
+            }
+        }
+        return result;
+    }
+
+    private boolean isToken(String token, int index) {
+        int len = tokenArray.length;
+        return index >= 0 && index < len && tokenArray[index].trim().equals(token.trim());
+    }
+
+    /**
+     * @param from
+     * @param to
+     * @return matching closing index in the token array for the current token, or -1 if there is
+     * no closing token within expected bounds.
+     */
+    private int findClosingIndex(int from, int to) {
+        if (from < to) {
+            String currToken = tokenArray[from];
+
+            if ("\"".equals(currToken)) {
+                for (int i = from + 1; i <= to; i++) {
+                    if ("\"".equals(tokenArray[i])) {
+                        return i;
+                    }
+                }
+            } else if ("'".equals(currToken)) {
+                for (int i = from + 1; i <= to; i++) {
+                    if ("'".equals(tokenArray[i])) {
+                        return i;
+                    }
+                }
+            } else if ( "(".equals(currToken) || "[".equals(currToken) || "/".equals(currToken) ) {
+                boolean isQuoteClosed = true;
+                boolean isAposClosed = true;
+                int brackets = "(".equals(currToken) ? 1 : 0;
+                int angleBrackets = "[".equals(currToken) ? 1 : 0;
+                int slashes = "/".equals(currToken) ? 1 : 0;
+                for (int i = from + 1; i <= to; i++) {
+                    if ( "\"".equals(tokenArray[i]) ) {
+                        isQuoteClosed = !isQuoteClosed;
+                    } else if ( "'".equals(tokenArray[i]) ) {
+                        isAposClosed = !isAposClosed;
+                    } else if ( "(".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
+                        brackets++;
+                    } else if ( ")".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
+                        brackets--;
+                    } else if ( "[".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
+                        angleBrackets++;
+                    } else if ( "]".equals(tokenArray[i]) && isQuoteClosed && isAposClosed ) {
+                        angleBrackets--;
+                    } else if ( "/".equals(tokenArray[i]) && isQuoteClosed && isAposClosed && brackets == 0 && angleBrackets == 0) {
+                        slashes--;
+                    }
+
+                    if (isQuoteClosed && isAposClosed && brackets == 0 && angleBrackets == 0 && slashes == 0) {
+                        return i;
+                    }
+                }
+            }
+
+        }
+
+        return -1;
+    }
+
+    /**
+     * Checks if token is an attribute reference: it starts with @ and has at least one
+     * more character after it.
+     * @param token token to check, may be null
+     * @return true if the token denotes an attribute
+     */
+    private boolean isAtt(String token) {
+        if (token == null || token.length() <= 1) {
+            return false;
+        }
+        return token.charAt(0) == '@';
+    }
+
+    /**
+     * Creates one-element collection for the specified object.
+     * @param element
+     */
+    private Collection singleton(Object element) {
+        ArrayList result = new ArrayList();
+        result.add(element);
+        return result;
+    }
+
+    /**
+     * For the given source collection and the name held in the token at index
+     * {@code from}, returns collection of subnodes or attribute values. The remaining
+     * tokens (from + 1 .. to) are applied to what was found via evaluateAgainst.
+     * @param source collection expected to contain TagNode instances
+     * @param from index of the token holding the element or attribute name
+     * @param to index of the last token of the range
+     * @param isRecursive whether descendants of the source nodes are searched as well
+     * @param isFilterContext passed through to evaluateAgainst
+     * @return Collection of TagNode instances or collection of String instances.
+     * @throws XPatherException if the source contains an object that is not a TagNode,
+     *         or if the remaining tokens cannot be evaluated
+     */
+    private Collection getElementsByName(Collection source, int from, int to, boolean isRecursive, boolean isFilterContext) throws XPatherException {
+        String name = tokenArray[from].trim();
+
+        if (isAtt(name)) {
+            // attribute lookup: drop the leading '@'
+            name = name.substring(1);
+            Collection result = new ArrayList();
+            Collection nodes;
+            if (isRecursive) {
+                // collect the nodes returned by getAllElementsList(true) for every source node
+                nodes = new LinkedHashSet();
+                Iterator iterator = source.iterator();
+                while (iterator.hasNext()) {
+                    Object next = iterator.next();
+                    if (next instanceof TagNode) {
+                        TagNode node = (TagNode) next;
+                        nodes.addAll( node.getAllElementsList(true) );
+                    }
+                }
+            } else {
+                nodes = source;
+            }
+
+            Iterator iterator = nodes.iterator();
+            while (iterator.hasNext()) {
+                Object next = iterator.next();
+                if (next instanceof TagNode) {
+                    TagNode node = (TagNode) next;
+                    if ("*".equals(name)) {
+                        // "@*": all attribute values of the node
+                        result.addAll( evaluateAgainst(node.getAttributes().values(), from + 1, to, false, 1, 1, isFilterContext, null) );
+                    } else {
+                        String attValue = node.getAttributeByName(name);
+                        if (attValue != null) {
+                            result.addAll( evaluateAgainst(singleton(attValue), from + 1, to, false, 1, 1, isFilterContext, null) );
+                        }
+                    }
+                } else {
+                    throwStandardException();
+                }
+            }
+            return result;
+        } else {
+            // element lookup; LinkedHashSet keeps insertion order and drops duplicates
+            Collection result = new LinkedHashSet();
+            Iterator iterator = source.iterator();
+            int index = 0;
+            while (iterator.hasNext()) {
+                final Object next = iterator.next();
+                if (next instanceof TagNode) {
+                    TagNode node = (TagNode) next;
+                    index++;
+                    boolean isSelf = ".".equals(name);
+                    boolean isParent = "..".equals(name);
+                    boolean isAll = "*".equals(name);
+
+                    Collection subnodes;
+                    if (isSelf) {
+                        subnodes = singleton(node);
+                    } else if (isParent) {
+                        TagNode parent = node.getParent();
+                        subnodes = parent != null ? singleton(parent) : new ArrayList();
+                    } else {
+                        subnodes = isAll ? node.getChildTagList() : node.getElementListByName(name, false);
+                    }
+
+                    // apply the rest of the expression to the candidates found for this node
+                    LinkedHashSet nodeSet = new LinkedHashSet(subnodes);
+                    Collection refinedSubnodes = evaluateAgainst(nodeSet, from + 1, to, false, index, nodeSet.size(), isFilterContext, null);
+
+                    if (isRecursive) {
+                        List childTags = node.getChildTagList();
+                        if (isSelf || isParent || isAll) {
+                            result.addAll(refinedSubnodes);
+                        }
+                        // repeat the same lookup for every child tag of the node
+                        Iterator childIterator = childTags.iterator();
+                        while (childIterator.hasNext()) {
+                            TagNode childTag = (TagNode) childIterator.next();
+                            Collection childrenByName = getElementsByName(singleton(childTag), from, to, isRecursive, isFilterContext);
+                            // for a plain name, a matching child is added before the matches found beneath it
+                            if ( !isSelf && !isParent && !isAll && refinedSubnodes.contains(childTag) ) {
+                                result.add(childTag);
+                            }
+                            result.addAll(childrenByName);
+                        }
+                    } else {
+                        result.addAll(refinedSubnodes);
+                    }
+                } else {
+                    throwStandardException();
+                }
+            }
+            return result;
+        }
+    }
+
+    /**
+     * Evaluates logic operation on two collections.
+     * @param first
+     * @param second
+     * @param logicOperator
+     * @return Result of logic operation
+     */
+    protected boolean evaluateLogic(Collection first, Collection second, String logicOperator) {
+        if (first == null || first.size() == 0 || second == null || second.size() == 0) {
+            return false;
+        }
+        Object elem1 = first.iterator().next();
+        Object elem2 = second.iterator().next();
+        if (elem1 instanceof Number && elem2 instanceof Number) {
+            double d1 = ((Number)elem1).doubleValue();
+            double d2 = ((Number)elem2).doubleValue();
+            if ("=".equals(logicOperator)) {
+                return d1 == d2;
+            } else if ("<".equals(logicOperator)) {
+                return d1 < d2;
+            } else if (">".equals(logicOperator)) {
+                return d1 > d2;
+            } else if ("<=".equals(logicOperator)) {
+                return d1 <= d2;
+            } else if (">=".equals(logicOperator)) {
+                return d1 >= d2;
+            }
+        } else {
+            String s1 = toText(elem1);
+            String s2 = toText(elem2);
+            int result = s1.compareTo(s2);
+            if ("=".equals(logicOperator)) {
+                return result == 0;
+            } else if ("<".equals(logicOperator)) {
+                return result < 0;
+            } else if (">".equals(logicOperator)) {
+                return result > 0;
+            } else if ("<=".equals(logicOperator)) {
+                return result <= 0;
+            } else if (">=".equals(logicOperator)) {
+                return result >= 0;
+            }
+        }
+
+        return false;
+    }
+
+    /**
+     * Converts an object to the text used for comparisons: the text of a TagNode,
+     * an empty string for null, and toString() for anything else.
+     * @param o object to convert, may be null
+     * @return textual form of the object
+     */
+    private String toText(Object o) {
+        if (o instanceof TagNode) {
+            return ((TagNode)o).getText().toString();
+        }
+        return o == null ? "" : o.toString();
+    }
+
+}
@@ -0,0 +1,62 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+
+/**
+ * <p>Exception that could occur during XPather evaluation.</p>
+ */
+public class XPatherException extends Exception {
+
+    // message used when the caller does not supply one
+    private static final String DEFAULT_MESSAGE = "Error in evaluating XPath expression!";
+
+    /** Creates the exception with the default message. */
+    public XPatherException() {
+        super(DEFAULT_MESSAGE);
+    }
+
+    /**
+     * Creates the exception with the given message.
+     * @param message description of the failure
+     */
+    public XPatherException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates the exception wrapping the given cause.
+     * @param cause the underlying failure
+     */
+    public XPatherException(Throwable cause) {
+        super(cause);
+    }
+
+    /**
+     * Creates the exception with the given message and cause.
+     * @param message description of the failure
+     * @param cause the underlying failure
+     */
+    public XPatherException(String message, Throwable cause) {
+        super(message, cause);
+    }
+
+}
@@ -0,0 +1,313 @@
+/*  Copyright (c) 2006-2007, Vladimir Nikic
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+
+package org.htmlcleaner;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * <p>Abstract XML serializer - contains common logic for descendants.</p>
+ */
+public abstract class XmlSerializer extends Serializer {
+
+    public static final String XMLNS_NAMESPACE = "xmlns";
+
+	protected XmlSerializer(CleanerProperties props) {
+		super(props);
+    }
+	
+	private boolean creatingHtmlDom;
+	
+	 /**
+     * @param creatingHtmlDom the creatingHtmlDom to set
+     */
+    public void setCreatingHtmlDom(boolean creatingHtmlDom) {
+        this.creatingHtmlDom = creatingHtmlDom;
+    }
+
+    /**
+     * @return the creatingHtmlDom
+     */
+    public boolean isCreatingHtmlDom() {
+        return creatingHtmlDom;
+    }
+
+    /**
+     * @deprecated Use writeToStream() instead.
+     */
+    @Deprecated
+    public void writeXmlToStream(TagNode tagNode, OutputStream out, String charset) throws IOException {
+         super.writeToStream(tagNode, out, charset);
+    }
+
+    /**
+     * @deprecated Use writeToStream() instead.
+     */
+    @Deprecated
+    public void writeXmlToStream(TagNode tagNode, OutputStream out) throws IOException {
+         super.writeToStream(tagNode, out);
+    }
+
+    /**
+     * @deprecated Use writeToFile() instead.
+     */
+    @Deprecated
+    public void writeXmlToFile(TagNode tagNode, String fileName, String charset) throws IOException {
+        super.writeToFile(tagNode, fileName, charset);
+    }
+
+    /**
+     * @deprecated Use writeToFile() instead.
+     */
+    @Deprecated
+    public void writeXmlToFile(TagNode tagNode, String fileName) throws IOException {
+        super.writeToFile(tagNode, fileName);
+    }
+
+    /**
+     * @deprecated Use getAsString() instead.
+     */
+    @Deprecated
+    public String getXmlAsString(TagNode tagNode, String charset) {
+        return super.getAsString(tagNode, charset);
+    }
+
+    /**
+     * @deprecated Use getAsString() instead.
+     */
+    @Deprecated
+    public String getXmlAsString(TagNode tagNode) {
+        return super.getAsString(tagNode);
+    }
+
+    /**
+     * @deprecated Use write() instead.
+     */
+    @Deprecated
+    public void writeXml(TagNode tagNode, Writer writer, String charset) throws IOException {
+        super.write(tagNode, writer, charset);
+    }
+
+    protected String escapeXml(String xmlContent) {
+        return Utils.escapeXml(xmlContent, props, isCreatingHtmlDom());
+    }
+
+    protected boolean dontEscape(TagNode tagNode) {
+    	return props.isUseCdataFor(tagNode.getName());
+    }
+
+    protected boolean isMinimizedTagSyntax(TagNode tagNode) {
+        final TagInfo tagInfo = props.getTagInfoProvider().getTagInfo(tagNode.getName());
+        return tagNode.isEmpty() && (tagInfo == null || tagInfo.isMinimizedTagPermitted()) &&
+               ( props.isUseEmptyElementTags() || (tagInfo != null && tagInfo.isEmptyTag()) );
+    }
+    protected void serializeOpenTag(TagNode tagNode, Writer writer) throws IOException {
+        serializeOpenTag(tagNode, writer, true);
+    }
+    
+	/**
+	 * Serialize a CDATA section. If the context is a script or style tag, and
+	 * using CDATA for script and style is set to true, then we just write the
+	 * actual content, as the whole section is wrapped in CDATA tokens.
+	 * Otherwise we escape the content as if it were regular text.
+	 * 
+	 * @param item the CDATA instance
+	 * @param tagNode the TagNode within which the CDATA appears
+	 * @param writer the writer to output to
+	 * @throws IOException
+	 */
+	protected void serializeCData(CData item, TagNode tagNode, Writer writer) throws IOException{
+		if (dontEscape(tagNode)){
+			writer.write(item.getContentWithoutStartAndEndTokens());
+		} else {
+			writer.write(escapeXml(item.getContentWithStartAndEndTokens()));
+		}
+	}
+	
+	/**
+	 * Serialize a content token, escaping where necessary.
+	 * @param item the content token to serialize
+	 * @param tagNode the TagNode within which the content token appears
+	 * @param writer the writer to output to
+	 * @throws IOException
+	 */
+	protected void serializeContentToken(ContentNode item, TagNode tagNode, Writer writer) throws IOException {
+		if (dontEscape(tagNode)){            	
+			writer.write(item.getContent());
+		}else {
+			writer.write( escapeXml(item.getContent()) );
+		}     
+	}
+
+    protected void serializeOpenTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
+        if ( !isForbiddenTag(tagNode)) {
+            String tagName = tagNode.getName();
+            
+            //
+            // Ensure we use valid XML element names
+            //
+            tagName = Utils.sanitizeXmlIdentifier(tagName);
+            
+            Map<String, String> tagAtttributes = tagNode.getAttributes();
+
+            // always have head and body in newline
+            if (props.isAddNewlineToHeadAndBody() && isHeadOrBody(tagName)) {
+                writer.write("\n");
+            }
+
+            writer.write("<" + tagName);
+            Iterator<Map.Entry<String, String>> it = tagAtttributes.entrySet().iterator();
+            while (it.hasNext()) {
+                Map.Entry<String, String> entry = (Map.Entry<String,String>) it.next();
+                String attName = (String) entry.getKey();
+                String attValue = (String) entry.getValue();
+                serializeAttribute(tagNode, writer, attName, attValue);
+            }
+
+            if ( isMinimizedTagSyntax(tagNode) ) {
+                writer.write(" />");
+                if (newLine) {
+                    writer.write("\n");
+                }
+            } else if (dontEscape(tagNode)) {
+                // because we are not considering if the file is xhtml or html,
+                // we need to put a javascript comment in front of the CDATA in case this is NOT xhtml
+                writer.write(">");
+                if (!tagNode.getText().toString().startsWith(CData.SAFE_BEGIN_CDATA)) {
+                    writer.write(CData.SAFE_BEGIN_CDATA);
+                    //
+                    // Insert a newline after the CDATA start marker if there isn't
+                    // already a newline character there
+                    //
+                    if (!tagNode.getText().toString().equals("")){
+                    	char firstchar = tagNode.getText().toString().charAt(0);
+                    	if (firstchar != '\n' && firstchar !='\r') writer.write("\n");
+                    }
+                }
+            } else {
+                writer.write(">");
+            }
+        }
+    }
+    
+    /**
+     * @param tagNode
+     * @return true if the tag is forbidden
+     */
+    protected boolean isForbiddenTag(TagNode tagNode) {
+        // null tagName when rootNode is a dummy node.
+        // this happens when omitting the html envelope elements ( <html>, <head>, <body> elements )
+        String tagName = tagNode.getName();
+        return tagName == null;
+    }
+    
+    protected boolean isHeadOrBody(String tagName) {
+        return "head".equalsIgnoreCase(tagName) || "body".equalsIgnoreCase(tagName);
+    }
+    
+    /**
+     * This allows overriding to eliminate forbidden attributes (for example javascript attributes onclick, onblur, etc. )
+     * @param writer
+     * @param attName
+     * @param attValue
+     * @throws IOException
+     */
+    protected void serializeAttribute(TagNode tagNode, Writer writer, String attName, String attValue) throws IOException {
+    	//
+    	// For XML, we can't use the lax definition of attribute names used in HTML5, so
+    	// we have to replace any invalid ones with a generated attribute name, or skip
+    	// them entirely.
+    	//
+        if (!props.isAllowInvalidAttributeNames()){
+        	attName = Utils.sanitizeXmlIdentifier(attName, props.getInvalidXmlAttributeNamePrefix());
+        }
+    	
+        if (attName != null && (Utils.isValidXmlIdentifier(attName) || props.isAllowInvalidAttributeNames()) && !isForbiddenAttribute(tagNode, attName, attValue)) {
+            writer.write(" " + attName + "=\"" + escapeXml(attValue) + "\"");
+        }
+    }
+    
+    /**
+     * Override to add additional conditions.
+     * @param tagNode
+     * @param attName
+     * @param value
+     * @return true if the attribute should not be outputed.
+     */
+    protected boolean isForbiddenAttribute(TagNode tagNode, String attName, String value) {
+        return !props.isNamespacesAware() && (XMLNS_NAMESPACE.equals(attName) || attName.startsWith(XMLNS_NAMESPACE +":"));
+    }
+
+    protected void serializeEndTag(TagNode tagNode, Writer writer) throws IOException {
+       serializeEndTag(tagNode, writer, true);
+    }
+
+    protected void serializeEndTag(TagNode tagNode, Writer writer, boolean newLine) throws IOException {
+        if ( !isForbiddenTag(tagNode)) {
+            String tagName = tagNode.getName();
+            //
+            // Ensure we use valid XML element names
+            //
+            tagName = Utils.sanitizeXmlIdentifier(tagName);
+            if (dontEscape(tagNode)) {
+                // because we are not considering if the file is xhtml or html,
+                // we need to put a javascript comment in front of the CDATA in case this is NOT xhtml
+
+                if (!tagNode.getText().toString().trim().endsWith(CData.SAFE_END_CDATA)) {
+                	//
+                	// Insert a newline character before the CDATA end marker if there isn't one
+                	// already at the end of the tag node content
+                	//
+                	if (tagNode.getText().toString().length() > 0){
+                		char lastchar = tagNode.getText().toString().charAt(tagNode.getText().toString().length()-1);
+                		if (lastchar != '\n' && lastchar !='\r') writer.write("\n");
+                	}
+                	// Write the CDATA end marker
+                    writer.write(CData.SAFE_END_CDATA);
+                }
+            }
+
+            writer.write( "</" + tagName + ">" );
+
+            if (newLine) {
+                writer.write("\n");
+            }
+        }
+    }
+
+}
@@ -0,0 +1,48 @@
+package org.htmlcleaner;
+
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Depth-first node traversor. Use to iterate through all nodes under and including the specified root node.
+ * <p>
+ * This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
+ * </p>
+ * <p>
+ * The visitor's {@code head} callback is invoked when a node is reached and its {@code tail}
+ * callback once the traversal moves past it.
+ * </p>
+ */
+public class XmlTraversor {
+    // NOTE(review): this instance field is never read or written by the static traverse() method,
+    // which is the only member of the class - it looks like a leftover; confirm before removing.
+    private XmlVisitor visitor;
+
+    /**
+     * Start a depth-first traverse of the root and all of its descendants.
+     * @param visitor Node visitor.
+     * @param root the root node point to traverse.
+     */
+    public static void traverse(XmlVisitor visitor, HtmlNode root) {
+        HtmlNode node = root;
+        int depth = 0;
+        
+        while (node != null) {
+            visitor.head(node, depth);
+            if ( node instanceof TagNode && ((TagNode)node).hasChildren() ) {
+                // Descend into the first child.
+                node = (HtmlNode)((TagNode)node).getAllChildren().get(0);
+                depth++;
+            } else {
+                // NOTE(review): getSiblings() is declared outside this file; this code assumes it
+                // never returns null - confirm, in particular for a root node without a parent.
+            	List<? extends BaseToken> siblings = node.getSiblings();
+            	Iterator<? extends BaseToken> it = siblings.iterator();
+                // NOTE(review): every evaluation of this condition with hasNext() == true consumes
+                // one element of the iterator, and the loop body only runs when that element is
+                // null. Whether this really walks back up to an ancestor that has a following
+                // sibling depends on what getSiblings() returns - verify against its contract.
+                while (it.hasNext() && it.next() == null && depth > 0) {
+                    visitor.tail(node, depth);
+                    node = node.getParent();
+                    depth--;
+                }
+                visitor.tail(node, depth);
+                // Stop once the root itself has been closed.
+                if (node == root)
+                    break;
+                // Continue with the next element of the sibling iterator, or finish when it is exhausted.
+                if (it.hasNext()){
+                	node = (HtmlNode)it.next();
+                } else {
+                	node = null;
+                }
+            }
+        }
+    }
+}
@@ -0,0 +1,29 @@
+package org.htmlcleaner;
+
+/**
+ * Node visitor interface. Provide an implementing class to {@link XmlTraversor} to iterate through nodes.
+ * <p>
+ * This interface provides two methods, {@code head} and {@code tail}. The head method is called when the node is first
+ * seen, and the tail method when all of the node's children have been visited. As an example, head can be used to
+ * create a start tag for a node, and tail to create the end tag.
+ * </p>
+ */
+public interface XmlVisitor {
+    /**
+     * Callback for when a node is first visited, before any of its descendants.
+     *
+     * @param node the node being visited.
+     * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
+     * of that will have depth 1.
+     */
+    void head(HtmlNode node, int depth);
+
+    /**
+     * Callback for when a node is last visited, after all of its descendants have been visited.
+     *
+     * @param node the node being visited.
+     * @param depth the depth of the node, relative to the root node. E.g., the root node has depth 0, and a child node
+     * of that will have depth 1.
+     */
+    void tail(HtmlNode node, int depth);
+}
@@ -0,0 +1,124 @@
+/*  
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+package org.htmlcleaner.audit;
+
+/**
+ * Possible error codes (read: messages) that the cleaner uses to inform clients about the reason for, or the action
+ * taken by, a modification.
+ * @author Konstantin Burov (aectann@gmail.com)
+ */
+public enum ErrorType {
+
+    /**
+     * A tag whose existence is <i>critical</i> for the current tag is missing. Most likely, the current tag was pruned.
+     * Unlike {@link #RequiredParentMissing}, this reports the case where the cleaner removed the tag instead of
+     * creating the missing parent. See {@link org.htmlcleaner.TagInfo} for a more detailed description of fatal and
+     * required tags.
+     * <p>
+     * <b>Example:</b>
+     * <ul>
+     * <li>&lt;option> tag without parent &lt;select>
+     * <li>&lt;tr> tag without parent &lt;table>
+     * <li>...
+     * </ul>
+     */
+    FatalTagMissing,
+    /**
+     * The tag wasn't found on the list of allowed tags, thus it was removed.
+     */
+    NotAllowedTag,
+    /**
+     * A missing parent tag was added for the current tag (e.g. tbody for tr).
+     */
+    RequiredParentMissing,
+    /**
+     * No matching close token was found for the open tag. The tag was closed automatically.
+     * <p>
+     * <b>Example:</b>
+     * <p>
+     * &lt;p>Some text..
+     * <p>
+     * Unclosed &lt;p> tag.
+     */
+    UnclosedTag,
+    /**
+     * A second instance of a unique tag was found; most likely it was removed.
+     * <p>
+     * <b>Example:</b>
+     * <p>
+     * 
+     * <pre>
+     * &lt;head>
+     *    &lt;title>Some text&lt;/title>
+     *    &lt;title>Some more text&lt;/title>
+     * &lt;/head>
+     * </pre>
+     */
+    UniqueTagDuplicated,
+    /**
+     * The tag is deprecated and the current cleaner mode doesn't allow deprecated tags. The tag was removed.
+     * <p>
+     * <b>Example:</b>
+     * <ul>
+     * <li>&lt;u>
+     * <li>&lt;s>
+     * <li>&lt;strike>
+     * <li>....
+     * </ul>
+     */
+    Deprecated,
+    /**
+     * The tag has a child that is not permitted inside it. The tag is closed automatically to avoid such nesting.
+     * <p>
+     * <b>Example:</b>
+     * <p>
+     * &lt;p>Some text &lt;table>...&lt;/table>&lt;p>
+     * <p>
+     * &lt;table> is not allowed to be a child of &lt;p>, thus &lt;p> is closed before the &lt;table>
+     */
+    UnpermittedChild,
+
+    /**
+     * The tag is unknown and the current cleaner mode doesn't allow unknown tags. The tag was removed.
+     * <p>
+     * <b>Example:</b>
+     * <ul>
+     * <li>&lt;any>
+     * <li>&lt;tag>
+     * <li>....
+     * </ul>
+     */
+    Unknown
+}
@@ -0,0 +1,85 @@
+/*  
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+package org.htmlcleaner.audit;
+
+import org.htmlcleaner.TagNode;
+import org.htmlcleaner.conditional.ITagNodeCondition;
+
+/**
+ * Implementors can be registered on {@link org.htmlcleaner.CleanerProperties} to receive notifications about
+ * modifications made by html cleaner.
+ * 
+ * @author Konstantin Burov (aectann@gmail.com)
+ *
+ */
+public interface HtmlModificationListener {
+
+    /**
+     * Fired when the cleaner fixes an error in the html syntax.
+     * 
+     * @param certain - true if the change made doesn't hurt the end document.
+     * @param tagNode - problematic node.
+     * @param errorType - the kind of problem that was fixed.
+     */
+    void fireHtmlError(boolean certain, TagNode tagNode, ErrorType errorType);
+
+    /**
+     * Fired when the cleaner fixes ugly html -- the syntax was correct but the task was implemented by weird code.
+     * For example when deprecated tags are removed.
+     * 
+     * @param certainty - true if the change made doesn't hurt the end document.
+     * @param tagNode - problematic node.
+     * @param errorType - the kind of problem that was fixed.
+     */
+    void fireUglyHtml(boolean certainty, TagNode tagNode, ErrorType errorType);
+
+    /**
+     * Fired when the cleaner modifies html due to an {@link ITagNodeCondition} match.
+     * 
+     * @param condition that was applied to make the modification
+     * @param tagNode - problematic node.
+     */
+    void fireConditionModification(ITagNodeCondition condition, TagNode tagNode);
+
+    /**
+     * Fired when the cleaner modifies html due to user specified rules.
+     * 
+     * @param certainty - true if the change made doesn't hurt the end document.
+     * @param tagNode - problematic node.
+     * @param errorType - the kind of problem that was fixed.
+     */
+    void fireUserDefinedModification(boolean certainty, TagNode tagNode, ErrorType errorType);
+
+}
@@ -0,0 +1,32 @@
+package org.htmlcleaner.audit;
+
+import java.util.logging.Logger;
+
+import org.htmlcleaner.TagNode;
+import org.htmlcleaner.conditional.ITagNodeCondition;
+
+/**
+ * {@link HtmlModificationListener} that writes every notification it receives to a
+ * {@link Logger} at INFO level. Each message starts with the name of the callback that fired.
+ */
+public class HtmlModificationListenerLogger implements HtmlModificationListener {
+
+    private final Logger log;
+
+    /**
+     * @param log the logger that receives all notifications
+     */
+    public HtmlModificationListenerLogger(Logger log) {
+        this.log = log;
+    }
+
+    /** Logs the applied condition and the affected node. */
+    @Override
+    public void fireConditionModification(ITagNodeCondition condition, TagNode tagNode) {
+        this.log.info("fireConditionModification:"+condition+" at "+tagNode);
+    }
+
+    /** Logs the error type, the safety flag and the affected node. */
+    @Override
+    public void fireHtmlError(boolean safety, TagNode tagNode, ErrorType errorType) {
+        this.log.info("fireHtmlError:"+errorType+"("+safety+") at "+tagNode);
+    }
+
+    /** Logs the error type, the safety flag and the affected node. */
+    @Override
+    public void fireUglyHtml(boolean safety, TagNode tagNode, ErrorType errorType) {
+        // Prefix previously read "fireConditionModification:" (copy-paste slip), which made
+        // these entries indistinguishable from condition modifications.
+        this.log.info("fireUglyHtml:"+errorType+"("+safety+") at "+tagNode);
+    }
+
+    /** Logs the error type, the safety flag and the affected node. */
+    @Override
+    public void fireUserDefinedModification(boolean safety, TagNode tagNode, ErrorType errorType) {
+        // Prefix previously read "fireConditionModification" without a separating colon.
+        this.log.info("fireUserDefinedModification:"+errorType+"("+safety+") at "+tagNode);
+    }
+
+}
@@ -0,0 +1,10 @@
+package org.htmlcleaner.conditional;
+
+import org.htmlcleaner.TagNode;
+
+/**
+ * Common contract for the different node checkers: a yes/no test applied to a {@link TagNode}.
+ */
+public interface ITagNodeCondition {
+    /**
+     * @param tagNode the node to test
+     * @return true if the node meets this condition
+     */
+    boolean satisfy(TagNode tagNode);
+}
@@ -0,0 +1,12 @@
+package org.htmlcleaner.conditional;
+
+import org.htmlcleaner.TagNode;
+
+/**
+ * Condition that accepts every node.
+ */
+public class TagAllCondition implements ITagNodeCondition {
+    /**
+     * @param tagNode ignored
+     * @return always true
+     */
+    @Override
+    public boolean satisfy(TagNode tagNode) {
+        return true;
+    }
+}
@@ -0,0 +1,18 @@
+package org.htmlcleaner.conditional;
+
+import org.htmlcleaner.TagNode;
+
+/**
+ * Checks if node contains specified attribute.
+ */
+public class TagNodeAttExistsCondition implements ITagNodeCondition {
+    private final String attName;
+
+    /**
+     * @param attName name of the attribute to look for; a null name never matches
+     */
+    public TagNodeAttExistsCondition(String attName) {
+        this.attName = attName;
+    }
+
+    /**
+     * @param tagNode the node to test, may be null
+     * @return true if the node's attribute map contains the lower-cased attribute name;
+     *         false for a null node or a null attribute name
+     */
+    public boolean satisfy(TagNode tagNode) {
+        if (tagNode == null || attName == null) {
+            return false;
+        }
+        // The lookup key is lower-cased; presumably TagNode stores attribute names
+        // lower-cased - confirm against TagNode.
+        return tagNode.getAttributes().containsKey( attName.toLowerCase() );
+    }
+}
@@ -0,0 +1,30 @@
+package org.htmlcleaner.conditional;
+
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.htmlcleaner.TagNode;
+
+/**
+ * Checks if node has specified attribute with specified value.
+ */
+public class TagNodeAttNameValueRegexCondition implements ITagNodeCondition {
+    private Pattern attNameRegex;
+    private Pattern attValueRegex;
+
+    public TagNodeAttNameValueRegexCondition(Pattern attNameRegex, Pattern attValueRegex) {
+        this.attNameRegex = attNameRegex;
+        this.attValueRegex = attValueRegex;
+    }
+
+    public boolean satisfy(TagNode tagNode) {
+        if (tagNode != null ) {
+            for(Map.Entry<String, String>entry: tagNode.getAttributes().entrySet()) {
+                if ( (attNameRegex == null || attNameRegex.matcher(entry.getKey()).find()) && (attValueRegex == null || attValueRegex.matcher( entry.getValue() ).find())) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+}
@@ -0,0 +1,28 @@
+package org.htmlcleaner.conditional;
+
+import org.htmlcleaner.TagNode;
+
+/**
+ * Checks if node has specified attribute with specified value.
+ */
+public class TagNodeAttValueCondition implements ITagNodeCondition {
+    private String attName;
+    private String attValue;
+    private boolean isCaseSensitive;
+
+    public TagNodeAttValueCondition(String attName, String attValue, boolean isCaseSensitive) {
+        this.attName = attName;
+        this.attValue = attValue;
+        this.isCaseSensitive = isCaseSensitive;
+    }
+
+    public boolean satisfy(TagNode tagNode) {
+        if (tagNode == null || attName == null || attValue == null) {
+            return false;
+        } else {
+            return isCaseSensitive ?
+                    attValue.equals( tagNode.getAttributeByName(attName) ) :
+                    attValue.equalsIgnoreCase( tagNode.getAttributeByName(attName) );
+        }
+    }
+}
@@ -0,0 +1,25 @@
+package org.htmlcleaner.conditional;
+
+import org.htmlcleaner.TagNode;
+
+/**
+ * Matches empty autogenerated nodes. These nodes are created when an unclosed tag is immediately closed.
+ * @author patmoore
+ *
+ */
+public class TagNodeAutoGeneratedCondition implements ITagNodeCondition {
+
+    /** Shared instance; the condition holds no state. */
+    public static final TagNodeAutoGeneratedCondition INSTANCE = new TagNodeAutoGeneratedCondition();
+
+    /**
+     * @param tagNode the node to test, may be null
+     * @return true if the node is both auto-generated and empty; false for a null node,
+     *         in line with the other conditions in this package
+     * @see org.htmlcleaner.conditional.ITagNodeCondition#satisfy(org.htmlcleaner.TagNode)
+     */
+    public boolean satisfy(TagNode tagNode) {
+        // auto-generated node that is not needed.
+        return tagNode != null && tagNode.isAutoGenerated() && tagNode.isEmpty();
+    }
+
+    @Override
+    public String toString() {
+        return "auto generated tagNode";
+    }
+}
@@ -0,0 +1,94 @@
+package org.htmlcleaner.conditional;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.htmlcleaner.ContentNode;
+import org.htmlcleaner.ITagInfoProvider;
+import org.htmlcleaner.TagInfo;
+import org.htmlcleaner.TagNode;
+
+import static org.htmlcleaner.Utils.isEmptyString;
+import static org.htmlcleaner.Display.*;
+
+/**
+ * Checks if node is an <b>inline</b> or block element with empty contents (presumably
+ * white/non-breakable spaces count as empty too - see {@code Utils.isEmptyString}). Nodes that have a
+ * non-empty id attribute are considered to be non-empty, since they can be used in javascript scenarios.
+ * 
+ * Examples that should be pruned,
+ * <pre>
+ * &lt;u&gt;  &lt;/u&gt;
+ * &lt;table&gt;&lt;tr&gt;&lt;td&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
+ * </pre>
+ * 
+ * Examples of code that should NOT be pruned:
+ * 
+ * <pre>
+ * &lt;p&gt;&lt;img/&gt;&lt;/p&gt; - no content but image tags do not have text content.
+ * &lt;table&gt;&lt;tr&gt;&lt;td/&gt;&lt;td&gt;hi&lt;/td&gt;&lt;/tr&gt; - the first (empty) td is a placeholder so the second td is in the correct column
+ * </pre>
+ * @author Konstantin Burov
+ */
+public class TagNodeEmptyContentCondition implements ITagNodeCondition {
+
+    private static final String ID_ATTRIBUTE_NAME = "id";
+
+    /**
+     * Removal of element from this set can affect layout too hard.
+     */
+    private static final Set < String > unsafeBlockElements = new HashSet < String >();
+
+    static {
+        // cannot just remove a td unless removing the entire row. td's are place holders
+        unsafeBlockElements.add("td");
+        unsafeBlockElements.add("th");
+    }
+    private ITagInfoProvider tagInfoProvider;
+
+    /**
+     * @param provider source of the {@link TagInfo} used to classify each tag
+     */
+    public TagNodeEmptyContentCondition(ITagInfoProvider provider) {
+        this.tagInfoProvider = provider;
+    }
+
+    /**
+     * @param tagNode the node to test
+     * @return true if the node counts as empty content; td and th nodes never match at this level
+     */
+    public boolean satisfy(TagNode tagNode) {
+        return satisfy(tagNode, false);
+    }
+
+    /**
+     * Recursive worker behind {@link #satisfy(TagNode)}.
+     *
+     * @param tagNode the node to test
+     * @param override true to let td/th nodes match as well; used for the children of a node under test
+     * @return true if the node and all of its children are empty
+     */
+    private boolean satisfy(TagNode tagNode, boolean override) {
+        String name = tagNode.getName();
+        TagInfo tagInfo = tagInfoProvider.getTagInfo(name);
+        // Unknown tags, tags carrying an id, tags with display "none", empty tags and
+        // (unless overridden) the layout-critical td/th never match.
+        if (tagInfo == null || hasIdAttributeSet(tagNode) || none == tagInfo.getDisplay() || tagInfo.isEmptyTag()
+                || (!override && unsafeBlockElements.contains(name))) {
+            return false;
+        }
+        if (!isEmptyString(tagNode.getText())) {
+            return false;
+        }
+        if (tagNode.isEmpty()) {
+            return true;
+        }
+        // even though there may be no text need to make sure all children are empty or can be pruned
+        for (Object child : tagNode.getAllChildren()) {
+            if (child instanceof TagNode) {
+                // nested td/th may be pruned together with their ancestor
+                if (!satisfy((TagNode) child, true)) {
+                    return false;
+                }
+            } else if (child instanceof ContentNode) {
+                if (!((ContentNode) child).isBlank()) {
+                    return false;
+                }
+            } else {
+                // any other kind of child keeps the node
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** @return true if the node has a non-empty id attribute */
+    private boolean hasIdAttributeSet(TagNode tagNode) {
+        Map < String, String > attributes = tagNode.getAttributes();
+        String id = attributes.get(ID_ATTRIBUTE_NAME);
+        return !isEmptyString(id);
+    }
+
+}
@@ -0,0 +1,47 @@
+package org.htmlcleaner.conditional;
+
+import java.util.List;
+
+import org.htmlcleaner.TagNode;
+
+/**
+ * Checks if node is an insignificant br tag -- is placed at the end or at the
+ * start of a block.
+ * 
+ * @author Konstantin Burov (aectann@gmail.com)
+ */
+public class TagNodeInsignificantBrCondition implements ITagNodeCondition {
+
+	private static final String BR_TAG = "br";
+	
+	public TagNodeInsignificantBrCondition() {
+	}
+
+	public boolean satisfy(TagNode tagNode) {
+		if (!isBrNode(tagNode)) {
+			return false;
+		}
+		TagNode parent = tagNode.getParent();
+		List children = parent.getAllChildren();
+		int brIndex = children.indexOf(tagNode);		
+		return checkSublist(0, brIndex, children) || checkSublist (brIndex, children.size(), children);
+	}
+
+	private boolean isBrNode(TagNode tagNode) {
+		return tagNode != null && BR_TAG.equals(tagNode.getName());
+	}
+
+	private boolean checkSublist(int start, int end, List list) {
+		List sublist = list.subList(start, end);
+		for (Object object : sublist) {
+			if(!(object instanceof TagNode)){
+				return false;
+			}
+			TagNode node = (TagNode) object;
+			if(!isBrNode(node)&&!node.isPruned()){
+				return false;
+			}
+		}
+		return true;
+	}
+}
@@ -0,0 +1,18 @@
+package org.htmlcleaner.conditional;
+
+import org.htmlcleaner.TagNode;
+
+/**
+ * Checks if node has specified name.
+ */
+public class TagNodeNameCondition implements ITagNodeCondition {
+    private String name;
+
+    public TagNodeNameCondition(String name) {
+        this.name = name;
+    }
+
+    public boolean satisfy(TagNode tagNode) {
+        return tagNode == null ? false : tagNode.getName().equalsIgnoreCase(this.name);
+    }
+}
@@ -0,0 +1,220 @@
+/*  Copyright (c) 2006-2013, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+package org.htmlcleaner;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.StringWriter;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerException;
+import javax.xml.transform.TransformerFactory;
+import javax.xml.transform.dom.DOMSource;
+import javax.xml.transform.stream.StreamResult;
+
+import org.jdom2.input.DOMBuilder;
+import org.jdom2.output.Format;
+import org.jdom2.output.XMLOutputter;
+import org.junit.Assert;
+import org.junit.Before;
+import org.w3c.dom.Document;
+
+import static org.junit.Assert.assertEquals;
+
+
+/**
+ * Base class for HtmlCleaner tests. It creates a fresh cleaner and XML
+ * serializer before every test and offers assertion helpers that clean a
+ * piece of markup, serialize it in one of several ways and compare the
+ * outcome with an expected string.
+ */
+public abstract class AbstractHtmlCleanerTest {
+
+    /** XML declaration emitted in front of serialized documents. */
+    public static final String HEADER =
+        "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
+            //+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" "
+            //+ "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n";
+    private static final String HEADER_FULL = HEADER + "<html><head /><body>";
+    private static final String FOOTER = "</body></html>";
+
+    protected HtmlCleaner cleaner;
+    protected Serializer serializer;
+
+    /**
+     * Builds the default cleaner and a {@link SimpleXmlSerializer} sharing
+     * the same properties. Subclasses may replace either field in a test.
+     */
+    @Before
+    public void setup() {
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitXmlDeclaration(true);
+        props.setOmitDoctypeDeclaration(false);
+        props.setAdvancedXmlEscape(true);
+        props.setTranslateSpecialEntities(false);
+        props.setOmitComments(false);
+        props.setIgnoreQuestAndExclam(false);
+
+        cleaner = new HtmlCleaner(props);
+        serializer = new SimpleXmlSerializer(props);
+    }
+
+    /**
+     * Cleans {@code initial}, writes it with the current {@link #serializer}
+     * and expects exactly {@code expected}.
+     */
+    protected void assertCleaned(String initial, String expected) throws IOException {
+        assertEquals(expected, render(cleaner.clean(initial), serializer));
+    }
+
+    /**
+     * Same as {@link #assertCleaned(String, String)} but always serializes
+     * with a new {@link SimpleHtmlSerializer} built from the cleaner's properties.
+     */
+    protected void assertCleanedHtml(String initial, String expected) throws IOException {
+        TagNode cleaned = cleaner.clean(initial);
+        assertEquals(expected, render(cleaned, new SimpleHtmlSerializer(cleaner.getProperties())));
+    }
+
+    /**
+     * Cleans {@code initial} with the html envelope enabled, converts the
+     * result to a W3C DOM, transforms it to text, trims every line and
+     * compares the trimmed content of the body element with {@code expected}.
+     * Note that omitHtmlEnvelope is left set to {@code true} afterwards,
+     * whatever its value was before the call.
+     */
+    protected void assertCleanedDom(String initial, String expected) throws Exception {
+        cleaner.getProperties().setOmitHtmlEnvelope(false);
+        TagNode cleaned = cleaner.clean(initial);
+        Document dom = new DomSerializer(cleaner.getProperties()).createDOM(cleaned);
+
+        StringWriter raw = new StringWriter();
+        Transformer identity = TransformerFactory.newInstance().newTransformer();
+        identity.transform(new DOMSource(dom), new StreamResult(raw));
+
+        // Normalize indentation: trim each line and terminate it with "\n".
+        StringBuilder trimmedLines = new StringBuilder();
+        for (String line : raw.toString().split("\n")) {
+            trimmedLines.append(line.trim()).append("\n");
+        }
+        String normalized = trimmedLines.toString();
+
+        int bodyStart = normalized.indexOf("<body>\n") + 7;
+        int bodyEnd = normalized.indexOf("</body>");
+        assertEquals(expected, normalized.substring(bodyStart, bodyEnd).trim());
+        cleaner.getProperties().setOmitHtmlEnvelope(true);
+    }
+
+    /**
+     * Cleans {@code initial} with the html envelope enabled, converts the
+     * result to a JDOM document, outputs it with a default
+     * {@link XMLOutputter} and compares the untrimmed content of the body
+     * element with {@code expected}. The previous omitHtmlEnvelope value is
+     * restored once the comparison has passed.
+     */
+    protected void assertCleanedJDom(String initial, String expected) throws Exception {
+        boolean previousEnvelope = cleaner.getProperties().isOmitHtmlEnvelope();
+        cleaner.getProperties().setOmitHtmlEnvelope(false);
+        TagNode cleaned = cleaner.clean(initial);
+        org.jdom2.Document jdom = new JDomSerializer(cleaner.getProperties()).createJDom(cleaned);
+
+        StringWriter sink = new StringWriter();
+        new XMLOutputter().output(jdom, sink);
+        String xml = sink.toString();
+
+        String body = xml.substring(xml.indexOf("<body>") + 6, xml.indexOf("</body>"));
+        assertEquals(expected, body);
+        cleaner.getProperties().setOmitHtmlEnvelope(previousEnvelope);
+    }
+
+    /**
+     * Reads the named file through {@code Utils.readUrl} using UTF-8.
+     *
+     * @param filename path of the file to read
+     * @return the file content as a string
+     */
+    protected String readFile(String filename) throws IOException {
+        return Utils.readUrl(new File(filename).toURI().toURL(), "UTF-8").toString();
+    }
+
+    /**
+     * Cleans {@code input}, serializes it with the current serializer and
+     * expects {@code expected} wrapped in the XML header, an empty head and
+     * the body/html closing tags.
+     */
+    protected void assertHTML(String expected, String input) throws IOException {
+        String actual = render(cleaner.clean(input), serializer);
+        Assert.assertEquals(HEADER_FULL + expected + FOOTER, actual);
+    }
+
+    /**
+     * Like {@link #assertHTML(String, String)} but only the XML header is
+     * prepended; {@code expected} must contain the whole html element.
+     */
+    protected void assertHTMLWithHeader(String expected, String input) throws IOException {
+        String actual = render(cleaner.clean(input), serializer);
+        Assert.assertEquals(HEADER + expected, actual);
+    }
+
+    /**
+     * Cleans {@code input}, converts it to a W3C DOM with {@link DomSerializer},
+     * rebuilds that as JDOM and compares the raw UTF-8 output with
+     * {@code expected} wrapped in header and footer plus a trailing newline.
+     */
+    protected void assertHTMLUsingDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
+        DomSerializer domSerializer = new DomSerializer(cleaner.getProperties());
+        Document w3cDocument = domSerializer.createDOM(cleaner.clean(input));
+        org.jdom2.Document jdomDocument = new DOMBuilder().build(w3cDocument);
+
+        String actual = rawUtf8Outputter().outputString(jdomDocument);
+        Assert.assertEquals(HEADER_FULL + expected + FOOTER + "\n", actual);
+    }
+
+    /**
+     * Cleans {@code input}, converts it with {@link JDomSerializer} and
+     * compares the raw UTF-8 output with {@code expected} wrapped in header
+     * and footer plus a trailing newline.
+     */
+    protected void assertHTMLUsingJDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
+        Assert.assertEquals(HEADER_FULL + expected + FOOTER + "\n", toJDomString(input));
+    }
+
+    /**
+     * Like {@link #assertHTMLUsingJDomSerializer(String, String)} but only
+     * the XML header is prepended; {@code expected} must contain the whole
+     * html element.
+     */
+    protected void assertHTMLIncludingHeaderUsingJDomSerializer(String expected, String input) throws IOException, ParserConfigurationException {
+        Assert.assertEquals(HEADER + expected + "\n", toJDomString(input));
+    }
+
+    /**
+     * Pretty-prints a W3C document as UTF-8 XML with an XML declaration.
+     * A transformer failure is reported on standard error and yields an
+     * empty string.
+     *
+     * @param doc document to print
+     * @return the serialized document, or "" when transformation failed
+     */
+    protected String documentToString(final Document doc) {
+        final TransformerFactory factory = TransformerFactory.newInstance();
+        try {
+            final Transformer printer = factory.newTransformer();
+            printer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
+            printer.setOutputProperty(OutputKeys.METHOD, "xml");
+            printer.setOutputProperty(OutputKeys.INDENT, "yes");
+            printer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
+            printer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
+            final StringWriter out = new StringWriter();
+            printer.transform(new DOMSource(doc), new StreamResult(out));
+            return out.toString();
+        } catch (TransformerException e) {
+            System.err.println("Failed to toString document " + e);
+            return "";
+        }
+    }
+
+    /** Writes the node with the given serializer as UTF-8 and returns the text. */
+    private static String render(TagNode node, Serializer with) throws IOException {
+        StringWriter out = new StringWriter();
+        with.write(node, out, "UTF-8");
+        return out.toString();
+    }
+
+    /** Outputter using the raw format, UTF-8 encoding and "\n" line separators. */
+    private static XMLOutputter rawUtf8Outputter() {
+        return new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
+    }
+
+    /** Cleans the input, converts it to JDOM and returns its raw UTF-8 output. */
+    private String toJDomString(String input) throws IOException, ParserConfigurationException {
+        JDomSerializer jdomSerializer = new JDomSerializer(cleaner.getProperties());
+        org.jdom2.Document jdomDocument = jdomSerializer.createJDom(cleaner.clean(input));
+        return rawUtf8Outputter().outputString(jdomDocument);
+    }
+
+}
@@ -0,0 +1,37 @@
+package org.htmlcleaner;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests how the cleaner copes with badly terminated tags.
+ *
+ * @author patmoore
+ */
+public class BadTerminationTest extends TestCase {
+
+    /**
+     * An end tag that carries an attribute must serialize as a plain end tag.
+     */
+    public void testHandleGarbageInEndTag() throws Exception {
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitHtmlEnvelope(true);
+        props.setOmitXmlDeclaration(true);
+        props.setUseEmptyElementTags(false);
+
+        SimpleXmlSerializer xmlSerializer = new SimpleXmlSerializer(props);
+        String cleaned = xmlSerializer.getAsString("<div></div id=\"foo\">");
+        assertEquals("<div></div>", cleaned);
+    }
+
+    // Disabled test, kept for reference.
+    // public void testWhiteSpaceInTag() throws Exception {
+    // String s =
+    // "<html><body><table width=\"838\" cellpadding=\"5\" cellspacing=\"0\">\n"
+    // +
+    // "                <tbody>\n" +
+    // "                <td width=\"704\"> </td>\n" +
+    // "                </tr\n" +
+    // "                ></tbody>< /table></bo dy>";
+    // CleanerProperties cleanerProperties = new CleanerProperties();
+    // cleanerProperties.setOmitHtmlEnvelope(false);
+    // cleanerProperties.setOmitXmlDeclaration(true);
+    // cleanerProperties.setUseEmptyElementTags(false);
+    // String output = new
+    // SimpleXmlSerializer().getXmlAsString(cleanerProperties, s, "UTF-8");
+    // assertEquals("<html><head></head><body><table width=\"838\" cellpadding=\"5\" cellspacing=\"0\"><tbody><tr><td width=\"704\"> </td></tr></tbody></table></body></html>",output);
+    // }
+}
@@ -0,0 +1,88 @@
+package org.htmlcleaner;
+
+import java.io.*;
+
+import junit.framework.*;
+
+/**
+ * Test cases for {@link BrowserCompactXmlSerializer}.
+ *
+ * @author Konstantin Burov (aectann@gmail.com)
+ *
+ */
+public class BrowserCompactXmlSerializerTest extends TestCase {
+
+    private BrowserCompactXmlSerializer compactXmlSerializer;
+    private CleanerProperties properties;
+
+    /**
+     * Creates a serializer that omits both the html envelope and the XML
+     * declaration.
+     */
+    @Override
+    protected void setUp() throws Exception {
+        properties = new CleanerProperties();
+        properties.setOmitHtmlEnvelope(true);
+        properties.setOmitXmlDeclaration(true);
+        compactXmlSerializer = new BrowserCompactXmlSerializer(properties);
+    }
+
+    /** Serializes {@code html} with the compact serializer and compares it with {@code expected}. */
+    private void assertCompact(String expected, String html) {
+        assertEquals(expected, compactXmlSerializer.getAsString(html));
+    }
+
+    /**
+     * Whitespace around inline elements is kept exactly as it appears in
+     * the input.
+     */
+    public void testInlineWhitespaceHandling(){
+        assertCompact("<p>Test1 <a href=\"somelink\">Linktext</a> Test2</p>\n",
+                "<p>Test1 <a href=\"somelink\">Linktext</a> Test2</p>");
+
+        assertCompact("<p>Test1<a href=\"somelink\">Linktext</a>Test2</p>\n",
+                "<p>Test1<a href=\"somelink\">Linktext</a>Test2</p>");
+
+        assertCompact("one<br /><b>two</b>three<b>four</b>",
+                "one<br><b>two</b></br>three<b>four</b>");
+
+        assertCompact("one<br /><b>two</b>three <b>four</b>",
+                "one<br><b>two</b></br>three <b>four</b>");
+    }
+
+    /**
+     * Tests that serializer removes white spaces properly.
+     * @throws IOException
+     */
+    public void testRemoveInsignificantWhitespaces() throws IOException{
+        assertCompact("<u>text here, </u><b>some text</b>",
+                "        <u>text here, </u><b>some text</b>      ");
+        assertCompact("<div class=\"foo\">2 roots &lt; here &gt;</div>\n",
+                "    <div class=\"foo\">2 roots < here >  </div>");
+        assertCompact("<div class=\"foo\">2 roots &lt; here &gt;</div>\n",
+                "    <div class=\"foo\">2 roots \n    < here >  </div>");
+        assertCompact("<div class=\"foo\">2 roots <br />&lt; here &gt;</div>\n",
+                "    <div class=\"foo\">2 roots \n\n    < here >  </div>");
+    }
+
+    /**
+     * Non-breakable spaces also must be removed from start and end.
+     * @throws IOException
+     */
+    public void testRemoveLeadingAndEndingNbsp() throws IOException {
+        String sentence = "We have just released Jericho Road. Listen to Still Waters the lead-off track.";
+
+        // named entity, leading only
+        assertCompact(sentence, "&nbsp;&nbsp;" + sentence);
+        // decimal character reference on both sides
+        assertCompact(sentence, "&#160;" + sentence + "&#160;");
+        // hexadecimal character reference on both sides
+        assertCompact(sentence, "&#xA0;" + sentence + "&#xA0;");
+        // literal character mixed with a reference
+        assertCompact(sentence, SpecialEntities.NON_BREAKABLE_SPACE
+                + sentence + "&#xA0;"
+                + SpecialEntities.NON_BREAKABLE_SPACE);
+    }
+
+    /**
+     * Tests that contents of 'pre' tag are untouched.
+     * @throws IOException
+     */
+    public void testPreTagIsUntouched() throws IOException{
+        assertCompact("<pre>some text</pre>\n", "   <pre>some text</pre>");
+        assertCompact("<pre>     some text</pre>\n", "<pre>     some text</pre>");
+        // NOTE(review): "/n" is a literal slash + n, not a newline escape -- confirm intent.
+        assertCompact("<pre>some /n/n text</pre>\n", "<pre>some /n/n text</pre>");
+    }
+}
@@ -0,0 +1,604 @@
+/*  Copyright (c) 2006-2013, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+package org.htmlcleaner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class CDATATest extends AbstractHtmlCleanerTest {
+	
+	/**
+	 * Test for bug #189: a script whose CDATA section is never closed and
+	 * holds a 2048 character payload must be serialized with the payload
+	 * wrapped in a commented CDATA section, followed by the paragraph.
+	 * @throws Exception 
+	 */
+	@Test
+	public void UnclosedCDATA() throws Exception{
+		// Build the payload in a StringBuilder; += in a loop copies the string every pass.
+		StringBuilder payload = new StringBuilder(2048);
+		for (int i = 0; i < 2048; i++) {
+			payload.append('x');
+		}
+		String x = payload.toString();
+		String html = "<script><![CDATA[" + x + "</script><p>Test</p>";
+		
+		String expected = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script><p>Test</p>";
+		
+		cleaner.getProperties().setOmitHtmlEnvelope(true);
+		assertCleaned(html, expected);
+		cleaner.getProperties().setOmitHtmlEnvelope(false);
+	}
+	
+	/**
+	 * Test for bug #211: same as the unclosed CDATA case but with a payload
+	 * of 513*1024 characters.
+	 * This passes, but is marked @Ignore because it takes a while to run. Comment
+	 * out ignore and run this test before making any builds.
+	 * NOTE(review): the payload is now built in linear time; whether the
+	 * cleaning itself is fast enough to drop the ignore marker has not been
+	 * measured.
+	 * @throws Exception 
+	 */
+	@Ignore
+	@Test
+	public void UnclosedCDATA2() throws Exception{
+		final int length = 513*1024;
+		// Build the payload in a StringBuilder; += in a loop is quadratic at this size.
+		StringBuilder payload = new StringBuilder(length);
+		for (int i = 0; i < length; i++) {
+			payload.append('x');
+		}
+		String x = payload.toString();
+		String html = "<script><![CDATA[" + x + "</script><p>Test</p>";
+		
+		String expected = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script><p>Test</p>";
+		
+		cleaner.getProperties().setOmitHtmlEnvelope(true);
+		assertCleaned(html, expected);
+		cleaner.getProperties().setOmitHtmlEnvelope(false);
+	}
+	
+	
+	//
+	// Test for bug #185: a script opening a CDATA section followed by 1024
+	// characters and no end token at all. The HTML serializer is expected to
+	// emit the bare payload, the XML and JDOM paths the payload wrapped in a
+	// commented CDATA section.
+	//
+	@Test
+	public void noEndTokenLong() throws Exception{
+		// Build the payload in a StringBuilder; += in a loop copies the string every pass.
+		StringBuilder payload = new StringBuilder(1024);
+		for (int i = 0; i < 1024; i++) {
+			payload.append('x');
+		}
+		String x = payload.toString();
+		String html = "<script><![CDATA[" + x;
+		String expected = "<script>"+x+"</script>";
+		String expectedXml = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script>";
+		
+		cleaner.getProperties().setOmitHtmlEnvelope(true);
+		assertCleanedHtml(html, expected);
+		assertCleaned(html, expectedXml);
+		assertCleanedJDom(html, expectedXml);
+		cleaner.getProperties().setOmitHtmlEnvelope(false);
+	}
+	
+	//
+	// Test for bug #189: like the 1024 character case but with a 4096
+	// character payload and no end token.
+	//
+	@Test
+	public void noEndTokenReallyLong() throws Exception{
+		// Build the payload in a StringBuilder; += in a loop copies the string every pass.
+		StringBuilder payload = new StringBuilder(4096);
+		for (int i = 0; i < 4096; i++) {
+			payload.append('x');
+		}
+		String x = payload.toString();
+		String html = "<script><![CDATA[" + x;
+		String expected = "<script>"+x+"</script>";
+		String expectedXml = "<script>/*<![CDATA[*/\n" + x + "\n/*]]>*/</script>";
+		
+		cleaner.getProperties().setOmitHtmlEnvelope(true);
+		assertCleanedHtml(html, expected);
+		assertCleaned(html, expectedXml);
+		assertCleanedJDom(html, expectedXml);
+		cleaner.getProperties().setOmitHtmlEnvelope(false);
+	}
+	
+	/**
+	 * Regression check for issue #134: cleaning test31.html only has to
+	 * finish without throwing; the result is not inspected.
+	 * @throws IOException 
+	 */
+	@Test
+	public void strayEndTagInCDATA() throws IOException{
+		cleaner.clean(readFile("src/test/resources/test31.html"));
+	}
+
+	/**
+	 * A CDATA-like marker in ordinary paragraph text is not a CDATA section
+	 * and must come out escaped.
+	 * @throws IOException
+	 */
+	@Test
+	public void NotReallyCData() throws IOException{
+		assertCleaned(
+				"<p><![CDATA ]]> is sometimes used in XML",
+				"<html>\n<head />\n<body><p>&lt;![CDATA ]]&gt; is sometimes used in XML</p></body></html>");
+	}
+	
+	/**
+	 * Round-trip check for the HTML serializer: unlike the XML serializers it
+	 * does not wrap script content in a CDATA section, so the input document
+	 * is expected back unchanged even with useCdataForScriptAndStyle enabled.
+	 * 
+	 * @throws IOException
+	 */
+	@Test
+	public void NoCData() throws IOException{
+		CleanerProperties props = new CleanerProperties();
+		props.setOmitXmlDeclaration(true);
+		props.setOmitDoctypeDeclaration(true);
+		props.setIgnoreQuestAndExclam(false);
+		props.setUseCdataForScriptAndStyle(true);
+		this.cleaner = new HtmlCleaner(props);
+		this.serializer = new SimpleHtmlSerializer(cleaner.getProperties());
+
+		String document = "<html><head><script>function testNoOp(){<>}</script></head><body></body></html>";
+		assertCleaned(document, document);
+	}
+	
+	/**
+	 * Cleans test11.html and compares the result with test11_expected.html.
+	 * The fixture has a script without CDATA, a script with an unescaped
+	 * CDATA section and a paragraph with an incorrect CDATA declaration.
+	 * 
+	 * @throws IOException
+	 */
+    @Test
+    public void CDATAmixed() throws IOException{
+        assertCleaned(
+                readFile("src/test/resources/test11.html"),
+                readFile("src/test/resources/test11_expected.html"));
+    }
+    
+    /**
+     * Cleans test12.html with both the XML declaration and the doctype kept,
+     * and compares the XML serialization with test12_expected.html.
+     */
+    @Test
+    public void CDATAandDocType() throws IOException{
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitXmlDeclaration(false);
+        props.setOmitDoctypeDeclaration(false);
+        props.setIgnoreQuestAndExclam(false);
+        this.cleaner = new HtmlCleaner(props);
+        this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
+
+        String source = readFile("src/test/resources/test12.html");
+        String wanted = readFile("src/test/resources/test12_expected.html");
+        assertCleaned(source, wanted);
+    }
+
+    /**
+     * Checks how script, style and altscript content is wrapped in commented
+     * CDATA sections by the XML serializer. The cleaner is rebuilt with the
+     * XML declaration and doctype kept, no newline added to head and body,
+     * and CDATA enabled for "script,style,altscript". In every assertion the
+     * first argument is the expected markup and the second is the input.
+     */
+    @Test
+    public void scriptAndCData() throws IOException
+    {
+
+        CleanerProperties cleanerProperties = new CleanerProperties();
+        cleanerProperties.setOmitXmlDeclaration(false);
+        cleanerProperties.setOmitDoctypeDeclaration(false);
+        cleanerProperties.setIgnoreQuestAndExclam(false);
+        cleanerProperties.setAddNewlineToHeadAndBody(false);
+        cleanerProperties.setUseCdataFor("script,style,altscript");
+        this.cleaner = new HtmlCleaner(cleanerProperties);
+        this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
+
+        
+        // CDATA markers hidden behind line comments are normalized to block comments.
+        assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>", 
+        		"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
+        
+        // NOTE(review): identical to the assertion above -- confirm whether a different input was intended.
+        assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>", 
+        		"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
+
+        // Bare CDATA markers get wrapped in block comments.
+        assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>", 
+        		"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
+
+        // The expected markup places the style element in the head, so the whole document is compared.
+        assertHTMLWithHeader(
+				"<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
+        		"<style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style>");
+
+
+        // A script without any CDATA gets one added; its own line comments are kept.
+        assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
+        "<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
+
+        // Same input and expectation as the bare CDATA case above.
+        assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
+        "<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
+
+        // Line comments inside the CDATA section are content and are preserved.
+        assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
+            "<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
+
+        // End marker sharing a line with a comment: the comment stays, the marker moves to its own line.
+        assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
+        "<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
+
+        // Script body containing &, <, > and quotes must pass through unescaped inside the CDATA section.
+        assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n"
+                + "// \n"
+                + "function escapeForXML(origtext) {\n"
+                + "   return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+                + "       .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+                + "}\n"
+                + "// \n/*]]>*/"
+                + "</script>", "<script type=\"text/javascript\">\n"
+                + "// <![CDATA[\n"
+                + "function escapeForXML(origtext) {\n"
+                + "   return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+                + "       .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+                + "}\n"
+                + "// ]]>\n"
+                + "</script>");
+
+        // Markup-like characters in a script are wrapped, not escaped.
+        assertHTML("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
+
+        // altscript is listed in setUseCdataFor, so it is treated like script.
+        assertHTML("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
+        
+        // An unterminated CDATA section ends with its script; the following script is wrapped separately.
+        assertHTML(
+        		"<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
+       		    "<script>//<![CDATA[\nbanana(); //--></script><script>two</script>"
+        );
+    }
+    
+    /**
+     * Repeats the script/style CDATA cases through the DOM serializer path.
+     * The cleaner is rebuilt with the same properties as the XML variant;
+     * the serializer field is not reassigned here. In every assertion the
+     * first argument is the expected markup and the second is the input.
+     * NOTE(review): the throws clause lists IOException and
+     * ParserConfigurationException although Exception already covers both.
+     */
+    @Test
+    public void scriptAndCDataDom() throws IOException, ParserConfigurationException, Exception
+    {
+
+        CleanerProperties cleanerProperties = new CleanerProperties();
+        cleanerProperties.setOmitXmlDeclaration(false);
+        cleanerProperties.setOmitDoctypeDeclaration(false);
+        cleanerProperties.setIgnoreQuestAndExclam(false);
+        cleanerProperties.setAddNewlineToHeadAndBody(false);
+        cleanerProperties.setUseCdataFor("script,style,altscript");
+        this.cleaner = new HtmlCleaner(cleanerProperties);
+        
+        // CDATA markers hidden behind line comments are normalized to block comments.
+        assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>", 
+        		"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
+        
+        // NOTE(review): identical to the assertion above -- confirm whether a different input was intended.
+        assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>", 
+        		"<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>");
+
+        // Bare CDATA markers get wrapped in block comments.
+        assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>", 
+        		"<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
+
+		// NOTE(review): this is the only assertion in this method that goes through the
+		// JDom helper rather than the Dom one -- possibly a copy-paste leftover, confirm.
+		assertHTMLIncludingHeaderUsingJDomSerializer(
+				"<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
+				"<html><head><style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style></head></html>"
+		);
+
+        // A script without any CDATA gets one added; its own line comments are kept.
+        assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
+        "<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
+
+        // Same input and expectation as the bare CDATA case above.
+        assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>",
+        "<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>");
+
+        // Line comments inside the CDATA section are content and are preserved.
+        assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
+            "<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
+
+        // End marker sharing a line with a comment: the comment stays, the marker moves to its own line.
+        assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
+        "<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
+
+        // Script body containing &, <, > and quotes must pass through unescaped inside the CDATA section.
+        assertHTMLUsingDomSerializer("<script type=\"text/javascript\">/*<![CDATA[*/\n"
+                + "// \n"
+                + "function escapeForXML(origtext) {\n"
+                + "   return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+                + "       .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+                + "}\n"
+                + "// \n/*]]>*/"
+                + "</script>", "<script type=\"text/javascript\">\n"
+                + "// <![CDATA[\n"
+                + "function escapeForXML(origtext) {\n"
+                + "   return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+                + "       .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+                + "}\n"
+                + "// ]]>\n"
+                + "</script>");
+
+        // Markup-like characters in a script are wrapped, not escaped.
+        assertHTMLUsingDomSerializer("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
+
+        // altscript is listed in setUseCdataFor, so it is treated like script.
+        assertHTMLUsingDomSerializer("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
+        
+        // An unterminated CDATA section ends with its script; the following script is wrapped separately.
+        assertHTMLUsingDomSerializer(
+        		"<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
+       		    "<script>//<![CDATA[\nbanana(); //--></script><script>two</script>"
+        );
+    }
+    
+    /**
+     * Exercises CDATA handling for {@code script}, {@code style} and {@code altscript}
+     * content through the JDom serializer assertion helpers.
+     * <p>
+     * NOTE(review): the assertion helpers are defined outside this part of the file;
+     * judging by the literals, the first argument is the expected output and the
+     * second the HTML to clean -- confirm against the helper signatures.
+     */
+    @Test
+    public void scriptAndCDataJDom() throws IOException, ParserConfigurationException {
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitXmlDeclaration(false);
+        props.setOmitDoctypeDeclaration(false);
+        props.setIgnoreQuestAndExclam(false);
+        props.setAddNewlineToHeadAndBody(false);
+        props.setUseCdataFor("script,style,altscript");
+        this.cleaner = new HtmlCleaner(props);
+
+        // Literals that several assertions below share.
+        final String safeHelloWorld =
+                "<script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script>";
+        final String lineCommentedHelloWorld =
+                "<script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script>";
+        final String bareCdataHelloWorld =
+                "<script type=\"text/javascript\"><![CDATA[\nalert(\"Hello World\")\n]]></script>";
+        final String safeCommentedHelloWorld =
+                "<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>";
+        final String escapeFunction = "function escapeForXML(origtext) {\n"
+                + "   return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+                + "       .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+                + "}\n";
+
+        // The same line-commented input is asserted twice, as in the original test.
+        assertHTMLUsingJDomSerializer(safeHelloWorld, lineCommentedHelloWorld);
+        assertHTMLUsingJDomSerializer(safeHelloWorld, lineCommentedHelloWorld);
+
+        assertHTMLUsingJDomSerializer(safeHelloWorld, bareCdataHelloWorld);
+
+        assertHTMLIncludingHeaderUsingJDomSerializer(
+                "<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>",
+                "<style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style>");
+
+        // Script text without any CDATA markers in the second argument.
+        assertHTMLUsingJDomSerializer(
+                "<script type=\"text/javascript\">/*<![CDATA[*/\n// Comment \nalert(\"Hello World\")\n //\n/*]]>*/</script>",
+                "<script type=\"text/javascript\">// Comment \nalert(\"Hello World\")\n //\n</script>");
+
+        assertHTMLUsingJDomSerializer(safeHelloWorld, bareCdataHelloWorld);
+
+        // JavaScript line comments inside the CDATA section.
+        assertHTMLUsingJDomSerializer(safeCommentedHelloWorld,
+                "<script type=\"text/javascript\"><![CDATA[\n//\nalert(\"Hello World\")\n// \n]]></script>");
+
+        assertHTMLUsingJDomSerializer(safeCommentedHelloWorld,
+                "<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>");
+
+        // Script body that itself contains '<', '>' and '&'.
+        assertHTMLUsingJDomSerializer(
+                "<script type=\"text/javascript\">/*<![CDATA[*/\n"
+                        + "// \n"
+                        + escapeFunction
+                        + "// \n/*]]>*/"
+                        + "</script>",
+                "<script type=\"text/javascript\">\n"
+                        + "// <![CDATA[\n"
+                        + escapeFunction
+                        + "// ]]>\n"
+                        + "</script>");
+
+        assertHTMLUsingJDomSerializer("<script>/*<![CDATA[*/\n<>\n/*]]>*/</script>", "<script><></script>");
+
+        assertHTMLUsingJDomSerializer("<altscript>/*<![CDATA[*/\n<>\n/*]]>*/</altscript>", "<altscript><></altscript>");
+
+        // Two adjacent script elements, the first one using line-comment CDATA markers.
+        assertHTMLUsingJDomSerializer(
+                "<script>/*<![CDATA[*/\nbanana(); //-->\n/*]]>*/</script><script>/*<![CDATA[*/\ntwo\n/*]]>*/</script>",
+                "<script>//<![CDATA[\nbanana(); //--></script><script>two</script>");
+    }
+
+    /**
+     * With advanced XML escaping and entity deserialization enabled, asserts the
+     * {@code assertHTML} pairs for escaped {@code <>} content in both
+     * {@code script} and {@code altscript} elements.
+     */
+    @Test
+    public void escapingCDATA() throws IOException{
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitXmlDeclaration(false);
+        props.setOmitDoctypeDeclaration(false);
+        props.setIgnoreQuestAndExclam(false);
+        props.setAdvancedXmlEscape(true);
+        props.setAddNewlineToHeadAndBody(false);
+        props.setDeserializeEntities(true);
+        props.setUseCdataFor("script,style,altscript");
+        this.cleaner = new HtmlCleaner(props);
+        this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
+
+        // Same pair of literals for each element name listed in setUseCdataFor above.
+        for (String tag : new String[] { "script", "altscript" }) {
+            String open = "<" + tag + ">";
+            String close = "</" + tag + ">";
+            assertHTML(open + "/*<![CDATA[*/\n<>\n/*]]>*/" + close, open + "&lt;&gt;" + close);
+        }
+    }
+
+    @Test
+    public void removeCDATA() throws IOException{
+        CleanerProperties cleanerProperties = new CleanerProperties();
+        cleanerProperties.setOmitCdataOutsideScriptAndStyle(true);
+        cleanerProperties.setAddNewlineToHeadAndBody(false);
+        cleanerProperties.setUseCdataFor("script,style,altscript");
+        cleaner = new HtmlCleaner(cleanerProperties);
+        serializer = new SimpleXmlSerializer(cleaner.getProperties());
+
+        // Verify that CDATA not inside SCRIPT or STYLE elements are considered comments in HTML and thus stripped
+        // when cleaned.
+        assertHTML("<p></p>", "<p><![CDATA[&]]></p>");
+        assertHTML("<p>&amp;&amp;</p>", "<p>&<![CDATA[&]]>&</p>");
+        assertHTML("<noaltscript />", "<noaltscript><![CDATA[&]]></noaltscript>");
+    }
+
+    /**
+     * Cleans a {@code p} element that wraps a CDATA section, using a cleaner
+     * configured with {@code ignoreQuestAndExclam} enabled, and checks that the
+     * first child of the paragraph is a {@link ContentNode}.
+     */
+    @Test
+    public void CDATAinthewrongplace(){
+        CleanerProperties props = new CleanerProperties();
+        props.setIgnoreQuestAndExclam(true);
+        cleaner = new HtmlCleaner(props);
+
+        String html = "<p>"
+                + "<![CDATA[\n"
+                + "function helloWorld() {\n"
+                + "};\n"
+                + "]]>\n"
+                + "</p>";
+
+        TagNode root = cleaner.clean(html);
+        TagNode paragraph = root.findElementByName("p", true);
+
+        // No CData node is expected here: the section should have been handled as
+        // ordinary content instead.
+        Object firstChild = paragraph.getAllChildren().get(0);
+        assertTrue(firstChild instanceof ContentNode);
+    }
+    
+    /**
+     * A bare CDATA section directly inside a {@code script} element must become
+     * the first child, as a {@link CData} node holding the text between the markers.
+     */
+    @Test
+    public void nonSafeCDATA(){
+        String html = "<script type=\"text/javascript\">"
+                + "<![CDATA[\n"
+                + "function helloWorld() {\n"
+                + "};\n"
+                + "]]>\n"
+                + "</script>";
+
+        TagNode root = cleaner.clean(html);
+        TagNode scriptNode = root.findElementByName("script", true);
+
+        // The CDATA section must be represented by a CData node.
+        Object firstChild = scriptNode.getAllChildren().get(0);
+        assertTrue(firstChild instanceof CData);
+
+        CData section = (CData) firstChild;
+        assertEquals("\nfunction helloWorld() {\n};\n", section.getContentWithoutStartAndEndTokens());
+    }
+    
+    /**
+     * Checks both views of a parsed CDATA section: the raw content without the
+     * markers, and the content wrapped in the comment-guarded markers
+     * {@code /*<![CDATA[*}{@code /} ... {@code /*]]>*}{@code /}.
+     */
+    @Test
+    public void safeOutput(){
+        String html = "<script type=\"text/javascript\">"
+                + "<![CDATA[\n"
+                + "function helloWorld() {\n"
+                + "};\n"
+                + "]]>\n"
+                + "</script>";
+
+        TagNode root = cleaner.clean(html);
+        TagNode scriptNode = root.findElementByName("script", true);
+
+        // The CDATA section must be represented by a CData node.
+        Object firstChild = scriptNode.getAllChildren().get(0);
+        assertTrue(firstChild instanceof CData);
+        CData section = (CData) firstChild;
+
+        assertEquals("\nfunction helloWorld() {\n};\n", section.getContentWithoutStartAndEndTokens());
+        assertEquals("/*<![CDATA[*/\nfunction helloWorld() {\n};\n/*]]>*/", section.getContentWithStartAndEndTokens());
+    }
+    
+    /**
+     * Inside a CDATA section the characters '<' and '>' must not end the section:
+     * the whole script body has to stay within a single {@link CData} instance.
+     * This variant uses line-comment ({@code //}) guards around the markers.
+     */
+    @Test
+    public void safeCDATAAlternate(){
+        // Script text placed between the CDATA markers; it contains '<', '>' and '&'.
+        final String functionSource = "function escapeForXML(origtext) {\n"
+                + " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+                + " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+                + "}\n";
+        String html = "<script type=\"text/javascript\">\n"
+                + "//<![CDATA[\n"
+                + functionSource
+                + "//]]>\n"
+                + "</script>";
+
+        TagNode root = cleaner.clean(html);
+        TagNode scriptNode = root.findElementByName("script", true);
+
+        // The CData node is looked up at index 1, not 0.
+        Object secondChild = scriptNode.getAllChildren().get(1);
+        assertTrue(secondChild instanceof CData);
+        CData section = (CData) secondChild;
+
+        // Expected content: the newline that follows the opening marker, then the function text.
+        assertEquals("\n" + functionSource, section.getContentWithoutStartAndEndTokens());
+    }
+    
+    /**
+     * Inside a CDATA section the characters '<' and '>' must not end the section:
+     * the whole script body has to stay within a single {@link CData} instance.
+     * This variant uses block-comment guards around the markers.
+     */
+    @Test
+    public void safeCDATA(){
+        // Script text placed between the CDATA markers; it contains '<', '>' and '&'.
+        final String functionSource = "function escapeForXML(origtext) {\n"
+                + " return origtext.replace(/\\&/g,'&'+'amp;').replace(/</g,'&'+'lt;')\n"
+                + " .replace(/>/g,'&'+'gt;').replace(/\'/g,'&'+'apos;').replace(/\"/g,'&'+'quot;');"
+                + "}\n";
+        // NOTE(review): the closing guard below is followed by an extra '>' in the
+        // original test data; kept as is -- confirm whether it is intentional.
+        String html = "<script type=\"text/javascript\">\n"
+                + "/*<![CDATA[*/\n"
+                + functionSource
+                + "/*]]>*/>\n"
+                + "</script>";
+
+        TagNode root = cleaner.clean(html);
+        TagNode scriptNode = root.findElementByName("script", true);
+
+        // The CData node is looked up at index 1, not 0.
+        Object secondChild = scriptNode.getAllChildren().get(1);
+        assertTrue(secondChild instanceof CData);
+        CData section = (CData) secondChild;
+
+        // Expected content: the newline that follows the opening marker, then the function text.
+        assertEquals("\n" + functionSource, section.getContentWithoutStartAndEndTokens());
+    }
+    
+    /**
+     * A comment-guarded CDATA section inside a {@code style} element must be
+     * parsed into a {@link CData} node whose content excludes the markers.
+     */
+    @Test
+    public void style(){
+        String css = "<style type=\"text/css\">/*<![CDATA[*/\n#ampmep_188 { }\n/*]]>*/</style>";
+        TagNode root = cleaner.clean(css);
+        TagNode styleNode = root.findElementByName("style", true);
+
+        Object firstChild = styleNode.getAllChildren().get(0);
+        assertTrue(firstChild instanceof CData);
+
+        CData section = (CData) firstChild;
+        assertEquals("\n#ampmep_188 { }\n", section.getContentWithoutStartAndEndTokens());
+    }
+    
+    /**
+     * Cleans the {@code test17.html} resource with the XML declaration kept and
+     * compares it against {@code test17_expected.html} via {@code assertCleaned}.
+     */
+    @Test
+    public void preserveComments() throws IOException{
+        cleaner.getProperties().setOmitXmlDeclaration(false);
+        assertCleaned(
+                readFile("src/test/resources/test17.html"),
+                readFile("src/test/resources/test17_expected.html"));
+    }
+    
+    @Test
+    public void preserveCommentsXwiki() throws IOException{
+    	cleaner.getProperties().setOmitXmlDeclaration(false);
+    	cleaner.getProperties().setAddNewlineToHeadAndBody(false);
+    	assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//\nalert(\"Hello World\")\n// \n/*]]>*/</script>",
+    			   "<script type=\"text/javascript\">//<![CDATA[\n//\nalert(\"Hello World\")\n// ]]></script>"
+    	);
+    }
+    
+    @Test
+    public void preserveComments2() throws IOException{
+    	cleaner.getProperties().setOmitXmlDeclaration(false);
+    	cleaner.getProperties().setAddNewlineToHeadAndBody(false);
+    	assertHTML("<script type=\"text/javascript\">/*<![CDATA[*/\n//alert(\"Hello World\")\n/*]]>*/</script>",
+    			   "<script type=\"text/javascript\"><![CDATA[//alert(\"Hello World\")]]></script>"
+    	);
+    }
+
+}
@@ -0,0 +1,125 @@
+package org.htmlcleaner;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that a tag which had to be closed because of one of its children (the child tag is not
+ * allowed inside the parent) is reopened after that child.
+ * <p>
+ * Example input:
+ * <pre>{@code
+ * <p>text1<table><tr><td>text2</td></tr></table>text3</p>
+ * }</pre>
+ * A table is not allowed inside a paragraph, so the paragraph is closed before the table. If it
+ * were not reopened, {@code text3} would end up outside any {@code p} element and would no longer
+ * be styled like the rest of the paragraph. The output expected by these tests is:
+ * <pre>{@code
+ * <p>text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p>text3</p>
+ * }</pre>
+ */
+public class ClosedTagReopenTest extends TestCase {
+
+    /**
+     * Runs the shared cases with the HTML 4 tag set.
+     */
+    public void testSimpleHTML4() throws IOException {
+        CleanerProperties properties = new CleanerProperties();
+        properties.setHtmlVersion(HtmlCleaner.HTML_4);
+        // HTML 4 specific case: an inline <font> wrapped around a block-level <p>.
+        assertAllCleaned(properties, reopenCases(
+            new String[] {"<font>text1<p>text2</p>text3</font>", "<font>text1</font><p><font>text2</font></p><font>text3</font>"}));
+    }
+
+    /**
+     * Runs the shared cases with the HTML 5 tag set.
+     */
+    // The annotation is kept from the original source; this class extends the JUnit 3
+    // TestCase, where tests are found through their "test" name prefix.
+    @Test
+    public void testSimpleHTML5() throws IOException {
+        CleanerProperties properties = new CleanerProperties();
+        properties.setHtmlVersion(HtmlCleaner.HTML_5);
+        // HTML 5 specific case: bare text around a paragraph stays untouched.
+        assertAllCleaned(properties, reopenCases(
+            new String[] {"text1<p>text2</p>text3", "text1<p>text2</p>text3"}));
+    }
+
+    /**
+     * Builds the table of {input, expected output} pairs shared by both HTML versions.
+     *
+     * @param versionSpecificCase the one pair that differs between the HTML 4 and HTML 5 runs;
+     *        it is inserted as the fifth row, the position it had in the original per-version tables
+     * @return the complete, ordered case table
+     */
+    private static String[][] reopenCases(String[] versionSpecificCase) {
+        // Expected tail shared by the two "multiple replacements" cases.
+        final String listTail =
+            "<p class=\"p_class\" random=\"attribute\">text3</p>" +
+            "<ul><li>text4</li></ul>" +
+            "<p class=\"p_class\" random=\"attribute\">text5</p>" +
+            "<ul><li>text6</li></ul>";
+        return new String[][] {
+            new String[] { "<p>text1<table><tr><td>text2</td></tr></table>text3</p>", "<p>text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p>text3</p>" },
+            new String[] {"</p>text1","text1"},
+            new String[] {"<p>text1<div>text2</div>text3</p>", "<p>text1</p><div>text2</div><p>text3</p>"},
+            new String[] { "<div>text1<p>text2</p>text3</div>", "<div>text1<p>text2</p>text3</div>"},
+            versionSpecificCase,
+            new String[] {"<p>text1<div>text2</div>text3<div>text4</div></p>", "<p>text1</p><div>text2</div><p>text3</p><div>text4</div>"},
+            new String[] {"<p>text1<div>text2</div></p>", "<p>text1</p><div>text2</div>"},
+            new String[] {"<p>text1<p>text2</p></p>", "<p>text1</p><p>text2</p>"},
+            // test multiple internal breaks
+            new String[] {"<p><div>text1<p>text2<div>text3<p>text4<div>text5</div></p></div></p></div>","<p></p><div>text1<p>text2</p><div>text3<p>text4</p><div>text5</div></div></div>"},
+            // test attribute preservation
+            new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2</td></tr></table>text3</p>",
+                "<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
+            // but not all attributes ( id attribute must be unique )
+            // TODO: maybe a generated id so that correlation can be found?
+            new String[] { "<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1<table><tr><td>text2</td></tr></table>text3</p>",
+                "<p class=\"p_class\" random=\"attribute\" id=\"just_me\">text1</p><table><tbody><tr><td>text2</td></tr></tbody></table><p class=\"p_class\" random=\"attribute\">text3</p>" },
+            // test multiple replacements
+            // test to see if nested good <p> can be handled.
+            new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p>text2a</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
+                "<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p>text2a</p></td></tr></tbody></table>" +
+                listTail },
+            new String[] { "<p class=\"p_class\" random=\"attribute\">text1<table><tr><td>text2<p class=\"another_p_element\">text2a<div>test2b</div>test2c</p></td></tr></table>text3<ul><li>text4</ul>text5<ul><li>text6</ul></p>",
+                "<p class=\"p_class\" random=\"attribute\">text1</p><table><tbody><tr><td>text2<p class=\"another_p_element\">text2a</p><div>test2b</div><p class=\"another_p_element\">test2c</p></td></tr></tbody></table>" +
+                listTail },
+            new String[]{"<p>text1<table><tr><td>text2<tr><td>text3</table>text4</p>","<p>text1</p><table><tbody><tr><td>text2</td></tr><tr><td>text3</td></tr></tbody></table><p>text4</p>"}
+        };
+    }
+
+    /**
+     * Serializes every input with a {@link SimpleXmlSerializer} that omits the XML declaration and
+     * the html envelope, and compares the result with the expected output.
+     *
+     * @param properties cleaner properties with the HTML version already selected
+     * @param cases rows of {input, expected output}
+     * @throws IOException if serialization fails
+     */
+    private static void assertAllCleaned(CleanerProperties properties, String[][] cases) throws IOException {
+        properties.setOmitXmlDeclaration(true);
+        properties.setOmitHtmlEnvelope(true);
+        SimpleXmlSerializer serializer = new SimpleXmlSerializer(properties);
+        for (String[] testCase : cases) {
+            String cleaned = serializer.getAsString(testCase[0]);
+            assertEquals("started with=" + testCase[0], testCase[1], cleaned);
+        }
+    }
+
+}
@@ -0,0 +1,215 @@
+package org.htmlcleaner;
+
+import java.io.IOException;
+
+import org.htmlcleaner.conditional.TagNodeEmptyContentCondition;
+import org.htmlcleaner.conditional.TagNodeInsignificantBrCondition;
+
+import junit.framework.TestCase;
+
+/**
+ * Various tests for collapseNullHtml mode: empty elements and insignificant {@code br} tags are
+ * pruned through the prune conditions registered in {@link #setUp()}.
+ */
+public class CollapseHtmlTest extends TestCase {
+
+    /** A table row in which none of the cells is expected to be pruned. */
+    private static final String CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR = "<tr><td></td><td>Cannot eliminate anything in this row</td></tr>";
+
+    /** An image element, used as content that keeps its parent from being pruned. */
+    private static final String IMAGE = "<img src=\"http://localhost:8080/img/foo.jpg\" />";
+
+    /** Input whose elements all hold an image or text. */
+    private static final String DONT_COLLAPSE = "<span>" + IMAGE + "</span>" + "<p>" + IMAGE + "</p>"
+            + "<p>bar<table><tr><td></td><td>" + IMAGE + "</td><td> </td></tr></table>foo</p>";
+
+    /** Expected serialization of {@link #DONT_COLLAPSE}. */
+    private static final String DONT_COLLAPSE_OUTPUT = "<span>" + IMAGE + "</span>" + "<p>" + IMAGE + "</p>"
+            + "<p>bar</p><table><tbody><tr><td></td><td>" + IMAGE + "</td><td> </td></tr></tbody></table><p>foo</p>";
+
+    private HtmlCleaner cleaner;
+
+    private CleanerProperties properties;
+
+    private SimpleXmlSerializer serializer;
+
+    /**
+     * Creates a cleaner whose output omits the html envelope and the XML declaration, and
+     * registers the empty-content and insignificant-br prune conditions.
+     */
+    @Override
+    protected void setUp() throws Exception {
+        cleaner = new HtmlCleaner();
+        properties = cleaner.getProperties();
+        properties.setOmitHtmlEnvelope(true);
+        properties.setOmitXmlDeclaration(true);
+        serializer = new SimpleXmlSerializer(properties);
+        properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
+        properties.addPruneTagNodeCondition(new TagNodeInsignificantBrCondition());
+    }
+
+    /**
+     * Cleans {@code html} with the shared cleaner and compares its serialization.
+     *
+     * @param expected expected serialized output
+     * @param html input to clean
+     * @throws IOException if serialization fails
+     */
+    private void assertCollapsed(String expected, String html) throws IOException {
+        TagNode collapsed = cleaner.clean(html);
+        assertEquals(expected, serializer.getAsString(collapsed));
+    }
+
+    /**
+     * Make sure that a single empty tag is dropped.
+     *
+     * @throws IOException if serialization fails
+     */
+    public void testCollapseSingleEmptyTag() throws IOException {
+        assertCollapsed("", "<u></u>");
+    }
+
+    /**
+     * Make sure that tags containing only blanks are collapsed.
+     */
+    public void testCollapseSingleTagWithBlanks() throws IOException {
+        assertCollapsed("", "<u>   </u>");
+        assertCollapsed("", "<u> &#x20;  </u>");
+        // Disabled case (strange msword insert), kept for reference:
+        // assertCollapsed("", "<span style='mso-spacerun:yes'>  </span>");
+    }
+
+    /**
+     * Make sure that non-breaking spaces, in every spelling, are also collapsed away.
+     */
+    public void testCollapseSingleTagWithNbsp() throws IOException {
+        assertCollapsed("", "<u> &nbsp; </u>");
+        assertCollapsed("", "<u> &#160; </u>");
+        assertCollapsed("", "<u> &#xA0; </u>");
+        assertCollapsed("", "<u> " + SpecialEntities.NON_BREAKABLE_SPACE + " </u>");
+    }
+
+    /**
+     * Make sure that nested empty tags are collapsed.
+     */
+    public void testCollapseMultipleEmptyTags() throws IOException {
+        assertCollapsed("", "<b><i><u></u></i></b>");
+
+        // Slightly bad html: closing tags in the wrong order.
+        assertCollapsed("", "<b><i><u></i></u></b>");
+        // Slightly bad html again, this time with text that must survive.
+        assertCollapsed("<b>notme</b>", "<b><i><u></i></u>notme</b>");
+    }
+
+    /**
+     * Make sure that insignificant br tags are collapsed.
+     */
+    public void testCollapseInsignificantBr() throws IOException {
+        assertCollapsed("<p>Some text</p>", "<p><br/>Some text</p>");
+        assertCollapsed("<p>Some text</p>", "<p>Some text<BR/></p>");
+        assertCollapsed("<p>Some<br /> text</p>", "<p><br/>Some<br/> text<br/></p>");
+        assertCollapsed("<p>Some text <i>look here</i></p>", "<p><br/><br/>Some text <i>look here</i></p>");
+        assertCollapsed("Some text", "Some text<BR/>");
+    }
+
+    /**
+     * Make sure tag transformations do not interfere with collapsing.
+     */
+    public void testCollapseEmptyWithTagTransformations() throws IOException {
+        TagTransformation fontToSpan = new TagTransformation("font", "span", true);
+        fontToSpan.addAttributeTransformation("style", "${style};font-family:${face};font-size:${size};color:${color};");
+        fontToSpan.addAttributeTransformation("face");
+        fontToSpan.addAttributeTransformation("size");
+        fontToSpan.addAttributeTransformation("color");
+        fontToSpan.addAttributeTransformation("name", "${face}_1");
+        properties.getCleanerTransformations().addTransformation(fontToSpan);
+
+        assertCollapsed("", "<b><font face=\"Ariel\"><u></u></font></b>");
+    }
+
+    /**
+     * Make sure that runs of several {@code br} elements are eliminated.
+     */
+    public void testChainCollapseInsignificantBrs() throws IOException {
+        assertCollapsed("<p>Some<br />text</p>", "<p><br/><br>Some<br>text<br/><br><br></p>");
+    }
+
+    /**
+     * Make sure that intervening empty elements still cause unneeded {@code br}s to be
+     * eliminated (HTML 4 tag set).
+     */
+    public void testCollapseInsignificantBrWithEmptyElementsHTML4() throws IOException {
+        properties.setHtmlVersion(HtmlCleaner.HTML_4);
+        // NOTE(review): a second empty-content condition is added after switching the version,
+        // presumably because getTagInfoProvider() changes with it -- confirm.
+        properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
+
+        assertCollapsed("<p>Some text</p>", "<p><span>&nbsp;</span><br/>Some text</p>");
+        assertCollapsed("<p>Some text</p>", "<p>Some text<br><span></span><BR/><u><big></big></u><BR/></p>");
+        assertCollapsed("<p>Some text</p>", "<p>Some text<br><span></span><BR/><u><big></big></u><BR/><u></u></p>");
+    }
+
+    /**
+     * Same as the HTML 4 variant, with the HTML 5 tag set and without {@code big} elements.
+     */
+    public void testCollapseInsignificantBrWithEmptyElementsHTML5() throws IOException {
+        properties.setHtmlVersion(HtmlCleaner.HTML_5);
+        // NOTE(review): see the HTML 4 variant for why the condition is added again -- confirm.
+        properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
+
+        assertCollapsed("<p>Some text</p>", "<p><span>&nbsp;</span><br/>Some text</p>");
+        assertCollapsed("<p>Some text</p>", "<p>Some text<br><span></span><BR/><u></u><BR/></p>");
+        assertCollapsed("<p>Some text</p>", "<p>Some text<br><span></span><BR/><u></u><BR/><u></u></p>");
+    }
+
+    /**
+     * A br nested in formatting elements should be eliminated (HTML 4 tag set).
+     */
+    public void testInsureMeaninglessBrsStillCollapseEmptyElementsHTML4() throws IOException {
+        properties.setHtmlVersion(HtmlCleaner.HTML_4);
+        properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
+
+        assertCollapsed("<p>Some text</p>", "<p><u><br/></u>Some text<br><span><BR/><u><big><BR/></big></u></p></span>");
+    }
+
+    /**
+     * A br nested in formatting elements should be eliminated (HTML 5 tag set).
+     */
+    public void testInsureMeaninglessBrsStillCollapseEmptyElementsHTML5() throws IOException {
+        properties.setHtmlVersion(HtmlCleaner.HTML_5);
+        properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
+
+        assertCollapsed("<p>Some text</p>", "<p><u><br/></u>Some text<br><span><BR/><u><BR/></u></p></span>");
+    }
+
+    /**
+     * Because elements with ids can be referred to by javascript, don't assume that such
+     * elements can be eliminated. The id attribute name is matched in any letter case.
+     */
+    public void testCollapseOnlyFormattingElementsWithNoIds() throws IOException {
+        final String expected = "<b id=\"notme\"></b><span id=\"norme\"></span>";
+        assertCollapsed(expected, "<b id=\"notme\"></b><span></span><span id=\"norme\"></span>");
+        assertCollapsed(expected, "<b iD=\"notme\"></b><span></span><span ID=\"norme\"></span>");
+    }
+
+    /**
+     * Empty table structures are pruned entirely, while rows and cells with real content (or an
+     * element carrying an id) are kept.
+     */
+    public void testCollapseAggressively() throws IOException {
+        properties.addPruneTagNodeCondition(new TagNodeEmptyContentCondition(properties.getTagInfoProvider()));
+
+        assertCollapsed("", "<p><table><tr></tr><tr><td></td></tr></table></p>");
+        assertCollapsed(DONT_COLLAPSE_OUTPUT, DONT_COLLAPSE);
+        assertCollapsed(
+                "<p id=\"notme\"></p><table><tbody><tr><td>Nor me</td></tr>"
+                        + CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR + "</tbody></table>",
+                "<p id=\"notme\"></p><table><tr></tr><tr><td>Nor me</td></tr><tr><td></td></tr><tr> </tr>"
+                        + "<tr>&nbsp;\n</tr>" + CANNOT_ELIMINATE_ANYTHING_IN_THIS_TR + "</table>");
+    }
+}
@@ -0,0 +1,34 @@
+package org.htmlcleaner;
+
+import junit.framework.TestCase;
+
+import java.io.ByteArrayInputStream;
+
+/**
+ * Testing HtmlCleaner constructors.
+ */
+public class ConstructorTest extends TestCase {
+
+    public void testPropertiesConstructor() throws Exception {
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitComments(true);
+
+        HtmlCleaner cleaner1 = new HtmlCleaner(props);
+        TagNode node1 = cleaner1.clean("<a href=index.htm><b><!--COMMENT 1--><b>text text<body>");
+        assertTrue( new SimpleXmlSerializer(props).getAsString(node1).indexOf("<!--COMMENT 1-->") < 0 );
+
+        HtmlCleaner cleaner2 = new HtmlCleaner(props);
+        TagNode node2 = cleaner2.clean("<span href=index1.htm><b><!--COMMENT 2--><x>DDDD text<body>");
+        assertTrue( new SimpleXmlSerializer(props).getAsString(node2).indexOf("<!--COMMENT 2-->") < 0 );
+
+        HtmlCleaner cleaner3 = new HtmlCleaner(props);
+        props.setOmitComments(false);
+        TagNode node3 = cleaner3.clean("<a href=index3.htm><b><!--COMMENT 3--><x>EEEEEEE text<body>");
+        assertTrue( new SimpleXmlSerializer(props).getAsString(node3).indexOf("<!--COMMENT 3-->") > 0 );
+
+        TagNode node4 = cleaner3.clean( new ByteArrayInputStream( ("FIRST" + (char)0x2 + (char)0x3 + "SECOND").getBytes() ), "ASCII" );
+        assertTrue( new CompactXmlSerializer(props).getAsString(node4).indexOf("FIRST  SECOND") >= 0 );
+
+    }
+
+}
@@ -0,0 +1,453 @@
+/*  Copyright (c) 2006-2013, HtmlCleaner project team (Vladimir Nikic, Scott Wilson, Pat Moore)
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+    You can contact Vladimir Nikic by sending e-mail to
+    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
+    subject line.
+*/
+package org.htmlcleaner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.junit.Test;
+import org.w3c.dom.Document;
+
+/**
+ * Tests for doctype handling: how the doctype declaration of a cleaned document
+ * is split into its parts, which {@link DoctypeToken} type constant it is mapped
+ * to, whether it is reported as valid, and how it is written out again by the
+ * string and DOM serializers.
+ *
+ * Unless a test replaces them, the {@code cleaner} and {@code serializer} used
+ * here are the ones inherited from {@link AbstractHtmlCleanerTest}.
+ */
+public class DocTypesTest extends AbstractHtmlCleanerTest{
+
+	/**
+	 * Cleans the {@code test12.html} fixture with the doctype declaration kept and
+	 * checks name, public id and system id of the doctype in the resulting DOM.
+	 */
+	@Test
+	public void DocTypeUsingDom() throws IOException, ParserConfigurationException{
+
+		CleanerProperties cleanerProperties = new CleanerProperties();
+		cleanerProperties.setOmitXmlDeclaration(false);
+		cleanerProperties.setOmitDoctypeDeclaration(false);
+		cleanerProperties.setIgnoreQuestAndExclam(false);
+		cleaner = new HtmlCleaner(cleanerProperties);
+
+		DomSerializer domSerializer = new DomSerializer(cleaner.getProperties());
+		String initial = readFile("src/test/resources/test12.html");
+		TagNode cleaned = cleaner.clean(initial);
+
+		Document doc = domSerializer.createDOM(cleaned);
+
+		assertEquals("html", doc.getDoctype().getName());
+		assertEquals("-//W3C//DTD XHTML 1.0 Strict//EN", doc.getDoctype().getPublicId());
+		assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", doc.getDoctype().getSystemId());
+	}
+
+	/**
+	 * Reads the given file as UTF-8 text.
+	 *
+	 * NOTE(review): an earlier TODO asked for this helper to be removed once the
+	 * class extends AbstractHtmlCleanerTest. The class does extend it now, so this
+	 * is presumably redundant if the base class offers an equivalent readFile;
+	 * confirm before deleting.
+	 *
+	 * @param filename path of the file to read
+	 * @return the file content
+	 * @throws IOException if the file cannot be read
+	 */
+	protected String readFile(String filename) throws IOException {
+		File file = new File(filename);
+		CharSequence content = Utils.readUrl(file.toURI().toURL(), "UTF-8");
+		return content.toString();
+	}
+
+	/**
+	 * A bare {@code <!DOCTYPE>} has no parts, is of unknown type, is not valid,
+	 * and is written back unchanged by the HTML serializer.
+	 */
+	@Test
+	public void none() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE><html><body></body></html>");
+		assertEquals(null, cleaned.getDocType().getPart1());
+		assertEquals(null, cleaned.getDocType().getPart2());
+		assertEquals("", cleaned.getDocType().getPublicId());
+		assertEquals("", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+		serializer = new SimpleHtmlSerializer(cleaner.getProperties());
+		String out = serializer.getAsString(cleaned);
+		// Expected value first, actual second, so a failure message reads correctly.
+		assertEquals("<!DOCTYPE>\n<html><head></head><body></body></html>", out);
+	}
+
+	//
+	// Check all the valid doctypes
+	//
+
+	/** Lower-case HTML5 doctype. */
+	@Test
+	public void html_5() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
+		assertEquals("html", cleaned.getDocType().getPart1());
+		assertEquals(null, cleaned.getDocType().getPart2());
+		assertEquals("", cleaned.getDocType().getPublicId());
+		assertEquals("", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML5, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** Upper-case HTML5 doctype; the first part keeps the case of the input. */
+	@Test
+	public void html_5_upper() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals(null, cleaned.getDocType().getPart2());
+		assertEquals("", cleaned.getDocType().getPublicId());
+		assertEquals("", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML5, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/**
+	 * HTML5 legacy-compat doctype. As asserted below, the identifier that follows
+	 * SYSTEM is reported through getPublicId() and getSystemId() stays empty.
+	 */
+	@Test
+	public void html_5_legacy() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM \"about:legacy-compat\"><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals("SYSTEM", cleaned.getDocType().getPart2());
+		assertEquals("about:legacy-compat", cleaned.getDocType().getPublicId());
+		assertEquals("", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML5_LEGACY_TOOL_COMPATIBLE, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** Same as {@link #html_5_legacy()} but with single quotes around the identifier. */
+	@Test
+	public void html_5_legacy_alternate() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM 'about:legacy-compat'><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals("SYSTEM", cleaned.getDocType().getPart2());
+		assertEquals("about:legacy-compat", cleaned.getDocType().getPublicId());
+		assertEquals("", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML5_LEGACY_TOOL_COMPATIBLE, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.0 with a public id only. */
+	@Test
+	public void html_4_0() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD HTML 4.0//EN", cleaned.getDocType().getPublicId());
+		assertEquals("", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.0 with public id and the strict DTD as system id. */
+	@Test
+	public void html_4_0_strict() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/REC-html40/strict.dtd\"><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD HTML 4.0//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/REC-html40/strict.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.01 strict given by public id alone is still valid. */
+	@Test
+	public void html_4_01_strict_identifierOnly() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
+		assertEquals("", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.01 strict where an explicit SYSTEM keyword sits between the two identifiers. */
+	@Test
+	public void html_4_01_strict_mixed() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\" SYSTEM \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
+		assertEquals("html", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/html4/strict.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.01 strict with public and system id. */
+	@Test
+	public void html_4_01_strict() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD HTML 4.01//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/html4/strict.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.01 transitional with public and system id. */
+	@Test
+	public void html_4_01_transitional() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD HTML 4.01 Transitional//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/html4/loose.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML4_01_TRANSITIONAL, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.01 frameset with public and system id. */
+	@Test
+	public void html_4_01_frameset() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\" \"http://www.w3.org/TR/html4/frameset.dtd\"><html><body></body></html>");
+		assertEquals("HTML", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD HTML 4.01 Frameset//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/html4/frameset.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.HTML4_01_FRAMESET, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.0 strict with public and system id. */
+	@Test
+	public void xhtml_1_strict() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
+		assertEquals("html", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD XHTML 1.0 Strict//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.XHTML1_0_STRICT, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.0 transitional with public and system id. */
+	@Test
+	public void xhtml_1_transitional() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html><body></body></html>");
+		assertEquals("html", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD XHTML 1.0 Transitional//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.XHTML1_0_TRANSITIONAL, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.0 frameset with public and system id. */
+	@Test
+	public void xhtml_1_frameset() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Frameset//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd\"><html><body></body></html>");
+		assertEquals("html", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD XHTML 1.0 Frameset//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.XHTML1_0_FRAMESET, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.1 with public and system id. */
+	@Test
+	public void xhtml_1_1() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\"><html><body></body></html>");
+		assertEquals("html", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD XHTML 1.1//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML Basic 1.1 with public and system id. */
+	@Test
+	public void xhtml_1_1_basic() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
+		assertEquals("html", cleaned.getDocType().getPart1());
+		assertEquals("PUBLIC", cleaned.getDocType().getPart2());
+		assertEquals("-//W3C//DTD XHTML Basic 1.1//EN", cleaned.getDocType().getPublicId());
+		assertEquals("http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd", cleaned.getDocType().getSystemId());
+		assertEquals(DoctypeToken.XHTML1_1_BASIC, cleaned.getDocType().getType());
+		assertTrue(cleaned.getDocType().isValid());
+	}
+
+	//
+	// Now some invalid ones
+	//
+	// In several of these the public id is still recognised, so a specific type
+	// is reported, but the system id is missing or belongs to another doctype
+	// and isValid() is expected to be false.
+	//
+
+	/** Bare doctype; repeats the type/validity checks of {@link #none()}. */
+	@Test
+	public void empty() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE><html><body></body></html>");
+		assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** A root name other than html gives an unknown, invalid doctype. */
+	@Test
+	public void not_html() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE banana><html><body></body></html>");
+		assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** The HTML 4.0 public id introduced by SYSTEM instead of PUBLIC is not recognised. */
+	@Test
+	public void html_4_0_wrong_id_type() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML SYSTEM \"-//W3C//DTD HTML 4.0//EN\"><html><body></body></html>");
+		assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.0 public id combined with the XHTML Basic 1.1 system id. */
+	@Test
+	public void html_4_0_wrong_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
+		assertEquals(DoctypeToken.HTML4_0, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.01 public id combined with the XHTML Basic 1.1 system id. */
+	@Test
+	public void html_4_01_wrong_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
+		assertEquals(DoctypeToken.HTML4_01_STRICT, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.01 transitional public id combined with the XHTML Basic 1.1 system id. */
+	@Test
+	public void html_4_01_transitional_bad_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
+		assertEquals(DoctypeToken.HTML4_01_TRANSITIONAL, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** HTML 4.01 frameset public id with no system id at all. */
+	@Test
+	public void html_4_01_frameset_bad_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\"><html><body></body></html>");
+		assertEquals(DoctypeToken.HTML4_01_FRAMESET, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.0 strict public id combined with the XHTML Basic 1.1 system id. */
+	@Test
+	public void xhtml_1_0_with_wrong_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
+		assertEquals(DoctypeToken.XHTML1_0_STRICT, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.0 transitional public id with no system id at all. */
+	@Test
+	public void xhtml_1_0_transitional_with_wrong_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"><html><body></body></html>");
+		assertEquals(DoctypeToken.XHTML1_0_TRANSITIONAL, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.0 frameset public id with no system id at all. */
+	@Test
+	public void xhtml_1_0_frameset_with_wrong_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Frameset//EN\"><html><body></body></html>");
+		assertEquals(DoctypeToken.XHTML1_0_FRAMESET, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.1 public id combined with the XHTML Basic 1.1 system id. */
+	@Test
+	public void xhtml_1_1_with_wrong_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml-basic11.dtd\"><html><body></body></html>");
+		assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** XHTML 1.1 public id with no system id at all. */
+	@Test
+	public void xhtml_1_1_with_no_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"><html><body></body></html>");
+		assertFalse(cleaned.getDocType().isValid());
+		assertEquals(DoctypeToken.XHTML1_1, cleaned.getDocType().getType());
+	}
+
+	/** XHTML Basic 1.1 public id with no system id at all. */
+	@Test
+	public void xhtml_1_1_basic_with_no_id() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
+		assertEquals(DoctypeToken.XHTML1_1_BASIC, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	/** A keyword other than PUBLIC or SYSTEM gives an unknown, invalid doctype. */
+	@Test
+	public void weird_token() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html SILLY \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
+		assertEquals(DoctypeToken.UNKNOWN, cleaned.getDocType().getType());
+		assertFalse(cleaned.getDocType().isValid());
+	}
+
+	//
+	// Serializer
+	//
+
+	/** The HTML 4.01 strict doctype is written at the start of the serialized output. */
+	@Test
+	public void html_4_01_serialize() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
+		String output = serializer.getAsString(cleaned);
+		assertTrue(output.startsWith("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"));
+	}
+
+	/** The HTML 4.01 strict doctype is carried over into the DOM document type. */
+	@Test
+	public void html_4_01_domserialize() throws IOException, ParserConfigurationException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
+		DomSerializer domSerializer = new DomSerializer(cleaner.getProperties());
+		Document doc = domSerializer.createDOM(cleaned);
+		assertEquals("html", doc.getDocumentElement().getNodeName());
+		assertEquals("HTML", doc.getDoctype().getName());
+		assertEquals("-//W3C//DTD HTML 4.01//EN", doc.getDoctype().getPublicId());
+		assertEquals("http://www.w3.org/TR/html4/strict.dtd", doc.getDoctype().getSystemId());
+	}
+
+	/** A lower-case "html" in an HTML 4.01 doctype is expected to be written as "HTML". */
+	@Test
+	public void html_4_01_case_correct() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\"><html><body></body></html>");
+		String output = serializer.getAsString(cleaned);
+		assertTrue(output.startsWith("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"));
+	}
+
+	/** An XHTML Basic 1.1 doctype without system id is written back without one. */
+	@Test
+	public void xhtml_1_1_serialize() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\"><html><body></body></html>");
+		String output = serializer.getAsString(cleaned);
+		assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML Basic 1.1//EN\">"));
+	}
+
+	/** The XHTML 1.0 strict doctype is written at the start of the serialized output. */
+	@Test
+	public void xhtml_1_0_strict_serialize() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
+		String output = serializer.getAsString(cleaned);
+		assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"));
+	}
+
+	/** An upper-case "HTML" in an XHTML 1.0 doctype is expected to be written as "html". */
+	@Test
+	public void xhtml_1_0_strict_serialize_case_correct() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html><body></body></html>");
+		String output = serializer.getAsString(cleaned);
+		assertTrue(output.startsWith("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"));
+	}
+
+	/** The HTML5 doctype is written at the start of the serialized output. */
+	@Test
+	public void html5_serialize() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
+		String output = serializer.getAsString(cleaned);
+		assertTrue(output.startsWith("<!DOCTYPE html>"));
+	}
+
+	/** An upper-case HTML5 doctype is expected to be written with lower-case "html". */
+	@Test
+	public void html5_serialize_case_correct() throws IOException{
+		TagNode cleaned = cleaner.clean("<!DOCTYPE HTML><html><body></body></html>");
+		String output = serializer.getAsString(cleaned);
+		assertTrue(output.startsWith("<!DOCTYPE html>"));
+	}
+
+	//
+	// Misc
+	//
+
+	/** toString() of the doctype token must equal its getContent(). */
+	@Test
+	public void checkToString(){
+		TagNode cleaned = cleaner.clean("<!DOCTYPE html><html><body></body></html>");
+		assertEquals(cleaned.getDocType().getContent(), cleaned.getDocType().toString());
+	}
+}
@@ -0,0 +1,380 @@
+/*  Copyright (c) 2006-2019, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+package org.htmlcleaner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.IOException;
+
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.jdom2.input.DOMBuilder;
+import org.jdom2.output.Format;
+import org.jdom2.output.XMLOutputter;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.w3c.dom.Document;
+
+public class DomSerializerTest extends AbstractHtmlCleanerTest {
+	
+	/**
+	 * An invalid tag such as {@code <^-^>} inside a paragraph is expected to show up
+	 * as escaped text in the DOM built with the single-argument DomSerializer constructor.
+	 */
+	@Test
+	public void removeInvalidTags3() throws Exception{
+		final TagNode cleaned = new HtmlCleaner().clean("<p><^-^></p>");
+		final Document dom = new DomSerializer(new CleanerProperties()).createDOM(cleaned);
+		final String paragraphText = dom.getElementsByTagName("p").item(0).getChildNodes().item(0).getTextContent();
+		assertEquals("&lt;^-^&gt;", paragraphText);
+	}
+	
+	/**
+	 * A non-ASCII attribute name and value must survive DOM creation when invalid
+	 * attribute names are disallowed and the serializer is set to XML version 1.1.
+	 */
+	@Test
+	public void attributeCharacters() throws Exception{
+		final TagNode cleaned = new HtmlCleaner().clean("<p dispariție='dispariție.'></p>");
+
+		final CleanerProperties props = new CleanerProperties();
+		props.setAllowInvalidAttributeNames(false);
+		final DomSerializer xml11Serializer = new DomSerializer(props);
+		xml11Serializer.setXmlVersion("1.1");
+
+		final Document dom = xml11Serializer.createDOM(cleaned);
+		final String firstAttribute = dom.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent();
+		assertEquals("dispariție.", firstAttribute);
+	}
+	
+	/**
+	 * Same scenario as the literal-character variant, but the non-ASCII letter in
+	 * the input is written as a Java unicode escape.
+	 */
+	@Test
+	public void attributeCharactersEncoded() throws Exception{
+		final TagNode cleaned = new HtmlCleaner().clean("<p dispari\u021bie='dispari\u021bie.'></p>");
+
+		final CleanerProperties props = new CleanerProperties();
+		props.setAllowInvalidAttributeNames(false);
+		final DomSerializer xml11Serializer = new DomSerializer(props);
+		xml11Serializer.setXmlVersion("1.1");
+
+		final Document dom = xml11Serializer.createDOM(cleaned);
+		final String firstAttribute = dom.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent();
+		assertEquals("dispariție.", firstAttribute);
+	}
+	
+	/**
+	 * An attribute whose name contains {@code %} must still yield an attribute with
+	 * the original value in the DOM when invalid attribute names are disallowed.
+	 */
+	@Test
+	public void attributeCharacters2() throws Exception{
+		final TagNode cleaned = new HtmlCleaner().clean("<p t%st='dispariție.'></p>");
+
+		final CleanerProperties props = new CleanerProperties();
+		props.setAllowInvalidAttributeNames(false);
+
+		final Document dom = new DomSerializer(props).createDOM(cleaned);
+		final String firstAttribute = dom.getElementsByTagName("p").item(0).getAttributes().item(0).getTextContent();
+		assertEquals("dispariție.", firstAttribute);
+	}
+	
+	// See bug #203
+	/**
+	 * Entities inside an attribute value must come out of the XML serializer
+	 * exactly as they went in. The serializer is handed the raw HTML string, so
+	 * no separately cleaned TagNode is needed here (the previously unused local
+	 * has been removed).
+	 */
+	@Test
+	public void parse2() throws Exception
+	{
+		String html = "<div foo=\"aaa&quot;bbb&amp;ccc&gt;ddd&lt;eee\">content</div>";
+		String expected = "<div foo=\"aaa&quot;bbb&amp;ccc&gt;ddd&lt;eee\">content</div>";
+		final CleanerProperties cleanerProperties = new CleanerProperties();
+		// Drop the html envelope and the XML declaration so only the div is compared.
+		cleanerProperties.setOmitHtmlEnvelope(true);
+		cleanerProperties.setOmitXmlDeclaration(true);
+		String out = new SimpleXmlSerializer(cleanerProperties).getAsString(html);
+		assertEquals(expected, out);
+	}
+	
+	// See bug #212
+	/**
+	 * An {@code &amp;} inside an attribute value must be preserved both in the DOM
+	 * built from the cleaned tree and in the XML string produced from the raw input.
+	 */
+	@Test
+	public void parse() throws Exception
+	{
+		final String source = "<?xml version = \"1.0\"?><img src=\"http://xwiki.org?a=&amp;b\"/>";
+		final CleanerProperties props = new CleanerProperties();
+
+		// DOM path: built before the omit-* switches below are turned on.
+		final TagNode cleaned = new HtmlCleaner().clean(source);
+		final Document dom = new DomSerializer(props, true).createDOM(cleaned);
+		final String srcAttribute = dom.getElementsByTagName("img").item(0).getAttributes().getNamedItem("src").getTextContent();
+		assertEquals("http://xwiki.org?a=&amp;b", srcAttribute);
+
+		// String path: envelope and XML declaration are omitted so only the img remains.
+		props.setOmitHtmlEnvelope(true);
+		props.setOmitXmlDeclaration(true);
+		final String serialized = new SimpleXmlSerializer(props).getAsString(source);
+		assertEquals("<img src=\"http://xwiki.org?a=&amp;b\" />", serialized);
+	}
+	
+	/**
+	 * An invalid tag such as {@code <^-^>} is expected to show up as escaped text
+	 * when the serializer is created with {@code false} as its second argument.
+	 */
+	@Test
+	public void removeInvalidTags() throws Exception{
+		final TagNode cleaned = new HtmlCleaner().clean("<p><^-^></p>");
+		final Document dom = new DomSerializer(new CleanerProperties(), false).createDOM(cleaned);
+		final String paragraphText = dom.getElementsByTagName("p").item(0).getChildNodes().item(0).getTextContent();
+		assertEquals("&lt;^-^&gt;", paragraphText);
+	}
+	
+	/**
+	 * A tag whose name starts with a digit ({@code <1o/>}) is expected to show up
+	 * as escaped text in the DOM rather than as an element.
+	 */
+	@Test
+	public void removeInvalidTags2() throws Exception{
+		final TagNode cleaned = new HtmlCleaner().clean("<p><1o/></p>");
+		final Document dom = new DomSerializer(new CleanerProperties(), false).createDOM(cleaned);
+		final String paragraphText = dom.getElementsByTagName("p").item(0).getChildNodes().item(0).getTextContent();
+		assertEquals("&lt;1o/&gt;", paragraphText);
+	}
+	
+	/**
+	 * A no-break space (U+00A0) between the tag name and the first attribute must
+	 * be treated as a separator, so the {@code property} attribute is still found.
+	 */
+	@Test
+	public void detectUnicodeSpaces() throws Exception{
+		final TagNode cleaned = new HtmlCleaner().clean("<meta\u00A0property=\"test\" content=\"value\">");
+		final Document dom = new DomSerializer(new CleanerProperties(), false).createDOM(cleaned);
+		final String property = dom.getElementsByTagName("meta").item(0).getAttributes().getNamedItem("property").getTextContent();
+		assertEquals("test", property);
+	}
+	
+	/**
+	 * Non-ASCII text inside a paragraph must appear unchanged in the string form
+	 * of the DOM document.
+	 */
+	@Test
+	public void preserveUnicodeTest() throws Exception
+	{
+		final String word = "hemförsäkring";
+		final String source = "<html><body><p>" + word + "</p></body></html>";
+
+		// One fragment per line of the expected pretty-printed document.
+		final String expected =
+				"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n"
+				+ "<html>\n"
+				+ "    <head/>\n"
+				+ "    <body>\n"
+				+ "        <p>" + word + "</p>\n"
+				+ "    </body>\n"
+				+ "</html>\n";
+
+		final TagNode cleaned = new HtmlCleaner().clean(source);
+		final Document dom = new DomSerializer(new CleanerProperties(), false).createDOM(cleaned);
+		assertEquals(expected, documentToString(dom));
+	}
+	
+	// See Bug #215
+	/**
+	 * Markup inside an attribute value: with {@code allowHtmlInsideAttributes} on it
+	 * stays in the attribute; with it off it becomes a separate tag whose name has
+	 * to be made XML-safe when a DOM is built.
+	 */
+	@Test
+	public void invalidXMLElementName() throws ParserConfigurationException{
+
+		final String html = "<img srcset=\"<p%20\">";
+
+		final CleanerProperties cleanerProperties = new CleanerProperties();
+		//
+		// When we set allow to true, then we parse the attribute value as text
+		//
+		cleanerProperties.setAllowHtmlInsideAttributes(true);
+		TagNode tagNode = new HtmlCleaner(cleanerProperties).clean(html);
+		// Expected value first, actual second, so a failure message reads correctly.
+		assertEquals("<p%20", tagNode.getChildTags()[1].getChildTags()[0].getAttributeByName("srcset"));
+		//
+		// When we set allow to false, then we identify tags in attribute as new tags, and break
+		// into a new tag
+		//
+		cleanerProperties.setAllowHtmlInsideAttributes(false);
+		tagNode = new HtmlCleaner(cleanerProperties).clean(html);
+
+		//
+		// Not an issue for HTML, which accepts pretty much anything in a tag name
+		//
+		cleanerProperties.setOmitXmlDeclaration(true);
+		String output = new SimpleHtmlSerializer(cleanerProperties).getAsString(tagNode);
+		assertEquals("<html><head></head><body><img srcset=\"\" /><p%20></p%20></body></html>", output);
+
+		//
+		// But for XML DOM, we must follow the rules for building valid names, which means
+		// getting rid of the % sign
+		//
+		final Document doc = new DomSerializer(cleanerProperties, false).createDOM(tagNode);
+		assertEquals(1, doc.getDocumentElement().getElementsByTagName("p20").getLength());
+
+	}	
+	
+	/**
+	 * A document created by a serializer that was constructed with {@code false} as its
+	 * last argument must report strict error checking as switched off.
+	 */
+	@Test
+	public void errorChecking() throws ParserConfigurationException {
+		final TagNode cleaned = cleaner.clean("<p>");
+		final DomSerializer serializer =
+				new DomSerializer(cleaner.getProperties(), true, true, false);
+		final Document created = serializer.createDocument(cleaned);
+		assertFalse(created.getStrictErrorChecking());
+	}
+    
+	/**
+	 * After cleaning test23.html, the first child of the document's first child node
+	 * must exist and be named {@code head}. See issue 108.
+	 * Currently disabled through {@code @Ignore}.
+	 *
+	 * @throws Exception if reading the fixture or building the DOM fails
+	 */
+	@Test
+	@Ignore
+	public void html5doctype() throws Exception {
+		final CleanerProperties props = cleaner.getProperties();
+		props.setUseCdataForScriptAndStyle(true);
+		props.setOmitCdataOutsideScriptAndStyle(true);
+
+		final TagNode cleaned = cleaner.clean(readFile("src/test/resources/test23.html"));
+		final Document dom = new DomSerializer(props).createDOM(cleaned);
+
+		final org.w3c.dom.Node firstGrandchild = dom.getChildNodes().item(0).getChildNodes().item(0);
+		assertNotNull(firstGrandchild);
+		assertEquals("head", firstGrandchild.getNodeName());
+	}
+    
+	/**
+	 * The {@code xmlns} and {@code id} attributes of the root element in test29.html must
+	 * be present on the first child node of the DOM document, and {@code id} must be usable
+	 * with {@code getElementById}. See issue 127.
+	 *
+	 * @throws Exception if reading the fixture or building the DOM fails
+	 */
+	@Test
+	public void rootNodeAttributes() throws Exception {
+		final CleanerProperties props = cleaner.getProperties();
+		props.setUseCdataForScriptAndStyle(true);
+		props.setOmitCdataOutsideScriptAndStyle(true);
+
+		final TagNode cleaned = cleaner.clean(readFile("src/test/resources/test29.html"));
+		final Document dom = new DomSerializer(props).createDOM(cleaned);
+
+		final org.w3c.dom.Node root = dom.getChildNodes().item(0);
+		assertNotNull(root.getChildNodes().item(0));
+
+		final org.w3c.dom.NamedNodeMap rootAttributes = root.getAttributes();
+		assertEquals("http://unknown.namespace.com", rootAttributes.getNamedItem("xmlns").getNodeValue());
+		assertEquals("27", rootAttributes.getNamedItem("id").getNodeValue());
+
+		//
+		// Check we have a real ID attribute in the DOM and not just a regular attribute
+		//
+		assertEquals("http://unknown.namespace.com", dom.getElementById("27").getAttribute("xmlns"));
+	}
+    
+    /**
+     * With {@code true} passed as the serializer's last constructor argument, the
+     * JDOM output must contain the unescaped text {@code this > that}.
+     */
+    @Test
+    public void cdata() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setUseCdataForScriptAndStyle(true);
+        props.setOmitCdataOutsideScriptAndStyle(true);
+
+        final TagNode cleaned = cleaner.clean("<script> this &gt; that </script>");
+        final Document w3cDoc =
+                new DomSerializer(props, props.isAdvancedXmlEscape(), true).createDOM(cleaned);
+        final org.jdom2.Document jdomDoc = new DOMBuilder().build(w3cDoc);
+
+        final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
+        final String xml = new XMLOutputter(rawUtf8).outputString(jdomDoc);
+        Assert.assertTrue(xml.contains("this > that"));
+    }
+    
+    /**
+     * With {@code false} passed as the serializer's last constructor argument, the
+     * JDOM output must contain the escaped text {@code this &gt; that}.
+     */
+    @Test
+    public void cdata2() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setUseCdataForScriptAndStyle(true);
+        props.setOmitCdataOutsideScriptAndStyle(true);
+
+        final TagNode cleaned = cleaner.clean("<script> this &gt; that </script>");
+        final Document w3cDoc =
+                new DomSerializer(props, props.isAdvancedXmlEscape(), false).createDOM(cleaned);
+        final org.jdom2.Document jdomDoc = new DOMBuilder().build(w3cDoc);
+
+        final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
+        final String xml = new XMLOutputter(rawUtf8).outputString(jdomDoc);
+        Assert.assertTrue(xml.contains("this &gt; that"));
+    }
+    
+    /**
+     * With special-entity translation and advanced XML escaping enabled, the literal pound
+     * sign, the named entity and the numeric reference must all read back as a pound sign.
+     */
+    @Test
+    public void escaping() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setTranslateSpecialEntities(true);
+        props.setAdvancedXmlEscape(true);
+
+        final TagNode cleaned = cleaner.clean("<div>£, &pound; and &#163;</div>");
+        final Document dom = new DomSerializer(props, true).createDOM(cleaned);
+
+        final String divText = dom.getElementsByTagName("div").item(0).getTextContent();
+        Assert.assertEquals("£, £ and £", divText);
+    }
+    
+    /**
+     * With special-entity translation off and {@code false} passed as the serializer's
+     * second argument, both entity forms must be left untouched in the div's text.
+     */
+    @Test
+    public void escaping_2() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setTranslateSpecialEntities(false);
+
+        final TagNode cleaned = cleaner.clean("<div>£, &pound; and &#163;</div>");
+        final Document dom = new DomSerializer(props, false).createDOM(cleaned);
+
+        final String divText = dom.getElementsByTagName("div").item(0).getTextContent();
+        Assert.assertEquals("£, &pound; and &#163;", divText);
+    }
+    
+    /**
+     * With special-entity translation off and {@code true} passed as the serializer's
+     * second argument, the named entity must stay as written while the numeric reference
+     * reads back as a pound sign.
+     */
+    @Test
+    public void escaping_3() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setTranslateSpecialEntities(false);
+
+        final TagNode cleaned = cleaner.clean("<div>£, &pound; and &#163;</div>");
+        final Document dom = new DomSerializer(props, true).createDOM(cleaned);
+
+        final String divText = dom.getElementsByTagName("div").item(0).getTextContent();
+        Assert.assertEquals("£, &pound; and £", divText);
+    }
+    
+    /**
+     * With Unicode character recognition off, the numeric reference must read back as the
+     * named entity {@code &pound;} and the named entity must stay as written.
+     */
+    @Test
+    public void escaping_4() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setRecognizeUnicodeChars(false);
+
+        final TagNode cleaned = cleaner.clean("<div>£, &pound; and &#163;</div>");
+        final Document dom = new DomSerializer(props, true).createDOM(cleaned);
+
+        final String divText = dom.getElementsByTagName("div").item(0).getTextContent();
+        Assert.assertEquals("£, &pound; and &pound;", divText);
+    }
+    
+    /**
+     * With Unicode character recognition off, the reserved characters in the div's text
+     * must read back as their named entities.
+     */
+    @Test
+    public void escapingReservedCharactersTest() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setRecognizeUnicodeChars(false);
+
+        final TagNode cleaned = cleaner.clean("<div>\" < > &</div>");
+        final Document dom = new DomSerializer(props, true).createDOM(cleaned);
+
+        final String divText = dom.getElementsByTagName("div").item(0).getTextContent();
+        Assert.assertEquals("&quot; &lt; &gt; &amp;", divText);
+    }
+
+    /**
+     * We shouldn't escape any characters in a comment: the text of the div's first child
+     * node must equal the comment content exactly as written in the input.
+     */
+    @Test
+    public void escapingCommentsTest() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setRecognizeUnicodeChars(false);
+
+        final TagNode cleaned = cleaner.clean("<div><!--\" \' < > &--></div>");
+        final Document dom = new DomSerializer(props, true).createDOM(cleaned);
+
+        final String commentText =
+                dom.getElementsByTagName("div").item(0).getChildNodes().item(0).getTextContent();
+        Assert.assertEquals("\" \' < > &", commentText);
+    }
+
+    
+    /**
+     * Numeric character references and the {@code &divide;} entity must appear as the
+     * corresponding characters in the JDOM output when special-entity translation is on.
+     */
+    @Test
+    public void ncr() throws Exception {
+        final CleanerProperties props = cleaner.getProperties();
+        props.setOmitComments(true);
+        props.setNamespacesAware(false);
+        props.setUseCdataForScriptAndStyle(true);
+        props.setTranslateSpecialEntities(true);
+
+        final TagNode cleaned = cleaner.clean("<div> &#8217; &#1078; &#253; &#247; &divide; </div>");
+        final Document w3cDoc =
+                new DomSerializer(props, props.isAdvancedXmlEscape(), false).createDOM(cleaned);
+        final org.jdom2.Document jdomDoc = new DOMBuilder().build(w3cDoc);
+
+        final Format rawUtf8 = Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n");
+        final String xml = new XMLOutputter(rawUtf8).outputString(jdomDoc);
+
+        Assert.assertTrue(xml.contains("’ ж ý ÷ ÷"));
+    }
+	
+}
@@ -0,0 +1,83 @@
+/*  Copyright (c) 2006-2014, the HtmlCleaner project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+package org.htmlcleaner;
+
+import junit.framework.TestCase;
+
+/**
+ * Checks the text read back from the body element when the cleaner is configured with
+ * {@code setDeserializeEntities(true)}.
+ */
+public class EntityDeserializationTest extends TestCase {
+
+    private HtmlCleaner cleaner;
+
+    /** Builds a fresh cleaner with entity deserialization switched on. */
+    @Override
+    public void setUp() {
+        final CleanerProperties props = new CleanerProperties();
+        props.setDeserializeEntities(true);
+        cleaner = new HtmlCleaner(props);
+    }
+
+    /** Drops the cleaner created in {@link #setUp()}. */
+    @Override
+    public void tearDown() {
+        cleaner = null;
+    }
+
+    /**
+     * Wraps {@code fragment} in an html/body envelope, cleans it and asserts that the text
+     * of the body element equals {@code expectedText}.
+     */
+    private void assertBodyText(String fragment, String expectedText) {
+        final String document = "<html><body>" + fragment + "</body></html>";
+        final TagNode body = cleaner.clean(document).findElementByName("body", true);
+        assertEquals(expectedText, body.getText().toString());
+    }
+
+    public void testNamedEntity() {
+        assertBodyText("&quot;", "\"");
+    }
+
+    public void testDecimalEntity() {
+        assertBodyText("&#160;", "\u00a0");
+    }
+
+    public void testHexadecimalEntity() {
+        assertBodyText("&#xa0;", "\u00a0");
+    }
+
+    /** An entity without its terminating semicolon is left as written. */
+    public void testAbortedEntity() {
+        assertBodyText("&amp&quot;", "&amp\"");
+    }
+
+    /** Entities inside a CDATA section within a script element are left as written. */
+    public void testCData() {
+        final String script = "<script>" + CData.BEGIN_CDATA + "&amp;" + CData.END_CDATA + "</script>";
+        assertBodyText(script, "&amp;");
+    }
+
+}
@@ -0,0 +1,236 @@
+/*  Copyright (c) 2006-2013, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+package org.htmlcleaner;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.jdom2.Document;
+import org.jdom2.Namespace;
+import org.jdom2.output.Format;
+import org.jdom2.output.XMLOutputter;
+import org.junit.Test;
+
+/**
+ * Tests for {@link JDomSerializer}: element naming, CDATA handling inside script and
+ * style elements, and namespace handling.
+ */
+public class JDomSerializerTest extends AbstractHtmlCleanerTest {
+
+	/**
+	 * Cleans {@code html} using {@code props}, builds a JDOM document from the result and
+	 * renders it in raw format with UTF-8 encoding and a line-feed line separator.
+	 * Shared by the string-comparison tests so that each of them only states its input,
+	 * its properties and its expected output.
+	 *
+	 * @param html markup to clean
+	 * @param props properties used for both the cleaner and the serializer
+	 * @return the XML text produced by the outputter
+	 */
+	private static String cleanToXml(String html, CleanerProperties props) {
+		TagNode tagNode = new HtmlCleaner(props).clean(html);
+		Document doc = new JDomSerializer(props, true).createJDom(tagNode);
+		XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setEncoding("UTF-8").setLineSeparator("\n"));
+		return outputter.outputString(doc);
+	}
+
+	//
+	// Test that we create valid element names
+	//
+	@Test
+	public void elementNames() throws IOException{
+		String initial = "<img srcset=\"<p%20\">";
+		String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head /><body><img srcset=\"\" /><p20 /></body></html>\n";
+		CleanerProperties props = new CleanerProperties();
+		props.setAddNewlineToHeadAndBody(false);
+		assertEquals(expected, cleanToXml(initial, props));
+	}
+
+	/**
+	 * Tests that we comment CDATA in JDom
+	 * @throws IOException
+	 */
+	@Test
+	public void safeCData1() throws IOException{
+		String initial = "<head><script type=\"text/javascript\"><![CDATA[alert(\"Hello World\")]]></script></head>";
+		String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
+		CleanerProperties props = new CleanerProperties();
+		props.setOmitCdataOutsideScriptAndStyle(true);
+		props.setAddNewlineToHeadAndBody(false);
+		assertEquals(expected, cleanToXml(initial, props));
+	}
+
+	/**
+	 * Tests that we comment CDATA in JDom; in this case preserving existing comments
+	 * @throws IOException
+	 */
+	@Test
+	public void safeCData2() throws IOException{
+		String initial = "<head><script type=\"text/javascript\">//<![CDATA[\nalert(\"Hello World\")\n//]]></script></head>";
+		String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
+		CleanerProperties props = new CleanerProperties();
+		props.setOmitCdataOutsideScriptAndStyle(true);
+		props.setAddNewlineToHeadAndBody(false);
+		assertEquals(expected, cleanToXml(initial, props));
+	}
+
+	/**
+	 * Tests that we comment CDATA in JDom; in this case that we normalise comment style
+	 * @throws IOException
+	 */
+	@Test
+	public void safeCData3() throws IOException{
+		String initial = "<head><script type=\"text/javascript\">/*<![CDATA[*/alert(\"Hello World\")\n/*]]>*/</script></head>";
+		String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script type=\"text/javascript\">/*<![CDATA[*/\nalert(\"Hello World\")\n/*]]>*/</script></head><body /></html>\n";
+		CleanerProperties props = new CleanerProperties();
+		props.setOmitCdataOutsideScriptAndStyle(true);
+		props.setAddNewlineToHeadAndBody(false);
+		assertEquals(expected, cleanToXml(initial, props));
+	}
+
+	/**
+	 * Tests that we comment CDATA in JDom; in this case a more complex example
+	 * @throws IOException
+	 */
+	@Test
+	public void safeCData4() throws IOException{
+		String initial = readFile("src/test/resources/test33.html");
+		String expected = readFile("src/test/resources/test33_expected.html");
+		CleanerProperties props = new CleanerProperties();
+		props.setOmitCdataOutsideScriptAndStyle(true);
+		props.setAddNewlineToHeadAndBody(false);
+		assertEquals(expected, cleanToXml(initial, props));
+	}
+
+	/**
+	 * Tests that we comment CDATA in JDom; here the script content consists of entities
+	 * and entity deserialization is switched on
+	 * @throws IOException
+	 */
+	@Test
+	public void safeCData5() throws IOException{
+		String initial = "<head><script>&lt;&gt;</script></head>";
+		String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><script>/*<![CDATA[*/\n<>\n/*]]>*/</script></head><body /></html>\n";
+		CleanerProperties props = new CleanerProperties();
+		props.setOmitCdataOutsideScriptAndStyle(true);
+		props.setUseCdataForScriptAndStyle(true);
+		props.setDeserializeEntities(true);
+		props.setAddNewlineToHeadAndBody(false);
+		assertEquals(expected, cleanToXml(initial, props));
+	}
+
+	/**
+	 * Tests that we comment CDATA in JDom; this test uses CSS
+	 * @throws IOException
+	 */
+	@Test
+	public void safeCData6() throws IOException{
+		String initial = "<head><style type=\"text/css\"><![CDATA[\na { color: red; }\n]]></style></head>";
+		String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<html><head><style type=\"text/css\">/*<![CDATA[*/\na { color: red; }\n/*]]>*/</style></head><body /></html>\n";
+		CleanerProperties props = new CleanerProperties();
+		props.setOmitCdataOutsideScriptAndStyle(true);
+		props.setUseCdataForScriptAndStyle(true);
+		props.setAddNewlineToHeadAndBody(false);
+		assertEquals(expected, cleanToXml(initial, props));
+	}
+
+	/**
+	 * Serializing a document cleaned with the HTML envelope omitted must not throw.
+	 * See issue #95
+	 */
+	@Test
+	public void testNPE(){
+		String validhtml5StringCode = "<html></html>";
+		CleanerProperties props = new CleanerProperties();
+		props.setOmitHtmlEnvelope(true);
+		TagNode tagNode = new HtmlCleaner(props).clean(validhtml5StringCode);
+		// No assertion: the test passes as long as createJDom completes without an exception.
+		new JDomSerializer(props, true).createJDom(tagNode);
+	}
+
+	/**
+	 * With CDATA enabled for script and style, the second content item of the script
+	 * element in test22.html must be a JDOM CDATA node. See issue 106
+	 * @throws Exception
+	 */
+	@Test
+	public void CDATA() throws Exception{
+		cleaner.getProperties().setUseCdataForScriptAndStyle(true);
+		cleaner.getProperties().setOmitCdataOutsideScriptAndStyle(true);
+		String initial = readFile("src/test/resources/test22.html");
+		TagNode tagNode = cleaner.clean(initial);
+		JDomSerializer ser = new JDomSerializer(cleaner.getProperties());
+		Document doc = ser.createJDom(tagNode);
+		assertEquals("org.jdom2.CDATA", doc.getRootElement().getChild("head").getChild("script").getContent().get(1).getClass().getName());
+	}
+
+	/**
+	 * With CDATA disabled for script and style, the first content item of the script
+	 * element in test22.html must be a plain JDOM Text node. See issue 106
+	 * @throws Exception
+	 */
+	@Test
+	public void noCDATA() throws Exception{
+		cleaner.getProperties().setUseCdataForScriptAndStyle(false);
+		cleaner.getProperties().setOmitCdataOutsideScriptAndStyle(true);
+		String initial = readFile("src/test/resources/test22.html");
+		TagNode tagNode = cleaner.clean(initial);
+		JDomSerializer ser = new JDomSerializer(cleaner.getProperties());
+		Document doc = ser.createJDom(tagNode);
+		assertEquals("org.jdom2.Text", doc.getRootElement().getChild("head").getChild("script").getContent().get(0).getClass().getName());
+	}
+
+	/**
+	 * Test we handle foreign markup OK
+	 * @throws Exception
+	 */
+	@Test
+	public void namespaces() throws Exception{
+		cleaner.getProperties().setNamespacesAware(true);
+		String initial = readFile("src/test/resources/test21.html");
+		TagNode tagNode = cleaner.clean(initial);
+		JDomSerializer ser = new JDomSerializer(cleaner.getProperties());
+		Document doc = ser.createJDom(tagNode);
+
+		Namespace xhtml = Namespace.getNamespace("http://www.w3.org/1999/xhtml");
+		Namespace svg = Namespace.getNamespace("http://www.w3.org/2000/svg");
+
+		//
+		// These will fail with an NPE if the namespaces are not correct
+		//
+		doc.getRootElement().getChild("body", xhtml).getNamespaceURI();
+		doc.getRootElement().getChild("body", xhtml).getChild("svg", svg).getNamespaceURI();
+		// NOTE(review): the result of this last lookup is not dereferenced, so a missing
+		// title element would not make the test fail - confirm whether that is intended.
+		doc.getRootElement().getChild("body", xhtml).getChild("svg", svg).getChild("title", svg);
+	}
+}
@@ -0,0 +1,63 @@
+/*  Copyright (c) 2006-2017, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+package org.htmlcleaner;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+/**
+ * Cleans MathML fixture documents and compares the result with the stored expected output.
+ */
+public class MathMLTest extends AbstractHtmlCleanerTest {
+
+	/**
+	 * Inline MathML statements must remain inline after cleaning. See bug #193
+	 * @throws IOException
+	 */
+	@Test
+	public void mathMLNamespaces() throws IOException {
+		assertCleaned(
+				readFile("src/test/resources/test35.html"),
+				readFile("src/test/resources/test35_expected.html"));
+	}
+
+	/**
+	 * MathML must be properly formed after cleaning. See bug #204
+	 * @throws IOException
+	 */
+	@Test
+	public void mathML() throws IOException {
+		assertCleaned(
+				readFile("src/test/resources/test36.html"),
+				readFile("src/test/resources/test36_expected.html"));
+	}
+
+}
@@ -0,0 +1,175 @@
+/*  Copyright (c) 2006-2013, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+
+package org.htmlcleaner;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+/**
+ * Tests for the handling of namespace declarations, prefixes and foreign markup.
+ */
+public class NamespacesTest extends AbstractHtmlCleanerTest {
+
+	/**
+	 * An xlink:href attribute without any xmlns declaration must cause the xlink
+	 * namespace declaration to be added automatically.
+	 * @throws IOException
+	 */
+	@Test
+	public void missingDeclaration() throws IOException {
+		String source = "<p xlink:href=\"#someHeading\"/>";
+		String cleaned = "<html xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n<head />\n<body><p xlink:href=\"#someHeading\"></p></body></html>";
+		assertCleaned(source, cleaned);
+	}
+
+	/**
+	 * An upper-case XMLNS attribute must be handled. See issue #135
+	 * @throws IOException
+	 */
+	@Test
+	public void xmlnsAttributeInUpperCase() throws IOException {
+		String source = "<BANANA XMLNS=\"BANANA\"/>";
+		String cleaned = "<html>\n<head />\n<body><BANANA XMLNS=\"BANANA\" /></body></html>";
+		assertCleaned(source, cleaned);
+	}
+
+	/**
+	 * A prefixed element that declares its own prefix, placed after explicit head and
+	 * body tags, must keep both the prefix and the declaration.
+	 * @throws IOException
+	 */
+	@Test
+	public void xmlnsAttributeAndPrefix() throws IOException {
+		String source = "\n<head />\n<body><xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\"/>";
+		String cleaned = "<html>\n<head />\n<body>\n<xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\" /></body></html>";
+		assertCleaned(source, cleaned);
+	}
+
+	/**
+	 * A prefixed element that declares its own prefix, given on its own, must keep both
+	 * the prefix and the declaration.
+	 * @throws IOException
+	 */
+	@Test
+	public void xmlnsAttributeAndPrefix2() throws IOException {
+		String source = "<xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\"/>";
+		String cleaned = "<html>\n<head />\n<body><xxx:BANANA xmlns:xxx=\"http://www.w3.org/1998/Math/MathML\" /></body></html>";
+		assertCleaned(source, cleaned);
+	}
+
+	/**
+	 * Empty xmlns="" attributes must be handled. See issue #135
+	 * @throws IOException
+	 */
+	@Test
+	public void emptyNamespaces() throws IOException {
+		String cleaned = "<html>\n<head />\n<body><a href=\"link.html\"><img /></a><p>Text</p></body></html>";
+		assertCleaned(readFile("src/test/resources/test32.html"), cleaned);
+	}
+
+	/**
+	 * Uses an RDFa example to test that we retain namespace declarations. See issue #63
+	 * @throws IOException
+	 */
+	@Test
+	public void RDFa() throws IOException {
+		assertCleaned(
+				readFile("src/test/resources/test13.html"),
+				readFile("src/test/resources/test13_expected.html"));
+	}
+
+	/**
+	 * Uses a namespace prefix for an element. See issue #63
+	 * @throws IOException
+	 */
+	@Test
+	public void DCElement() throws IOException {
+		assertCleaned(
+				readFile("src/test/resources/test14.html"),
+				readFile("src/test/resources/test14_expected.html"));
+	}
+
+	/**
+	 * Uses a namespace prefix for an attribute. See issue #63
+	 * @throws IOException
+	 */
+	@Test
+	public void DCAttribute() throws IOException {
+		assertCleaned(
+				readFile("src/test/resources/test15.html"),
+				readFile("src/test/resources/test15_expected.html"));
+	}
+
+	/**
+	 * If we aren't NS aware, strip out the xmlns attr and process everything
+	 * as HTML.
+	 */
+	@Test
+	public void testTableCellsWithoutNamespaceAwareness() throws IOException {
+		cleaner.getProperties().setNamespacesAware(false);
+		assertCleaned(
+				readFile("src/test/resources/test26.html"),
+				readFile("src/test/resources/test26_expected.html"));
+	}
+
+	/**
+	 * If we are namespace-aware and use the legacy HTML namespace, we should
+	 * treat the content as HTML. See issue #115
+	 */
+	@Test
+	public void testTableCellsUsingNamespaceAwareAndLegacyHtmlNS() throws IOException {
+		CleanerProperties props = cleaner.getProperties();
+		props.setNamespacesAware(true);
+		props.setOmitUnknownTags(true);
+		assertCleaned(
+				readFile("src/test/resources/test26.html"),
+				readFile("src/test/resources/test26_expected.html"));
+	}
+
+	/**
+	 * If we're NS-aware and using XHTML, treat the content as HTML tags and
+	 * insert TBODY into the table (etc) but retain the xmlns attr on the html
+	 * tag
+	 */
+	@Test
+	public void testTableCellsUsingNamespaceAwareAndXhtmlNS() throws IOException {
+		CleanerProperties props = cleaner.getProperties();
+		props.setNamespacesAware(true);
+		props.setOmitUnknownTags(true);
+		assertCleaned(
+				readFile("src/test/resources/test27.html"),
+				readFile("src/test/resources/test27_expected.html"));
+	}
+
+	/**
+	 * If we are namespace-aware and use an unknown namespace,
+	 * all the content will be treated as foreign markup; this means
+	 * there will be no insertion of TBODY tags as the table element
+	 * is not interpreted as being a HTML table element
+	 */
+	@Test
+	public void testTableCellsUsingNamespaceAwareAndUnknownNS() throws IOException {
+		CleanerProperties props = cleaner.getProperties();
+		props.setNamespacesAware(true);
+		props.setOmitUnknownTags(true);
+		assertCleaned(
+				readFile("src/test/resources/test28.html"),
+				readFile("src/test/resources/test28_expected.html"));
+	}
+}
@@ -0,0 +1,34 @@
+package org.htmlcleaner;
+
+import junit.framework.TestCase;
+import org.junit.Test;
+
+/**
+ * Checks that the cleaner can process a document whose elements are nested very deeply.
+ */
+public class NestingTest extends TestCase {
+
+    /** Number of nested elements in {@link #TOO_DEEP_DOC}. */
+    public final static int TOO_DEEP_NESTING = 9999;
+    /** A document made of {@link #TOO_DEEP_NESTING} nested, empty div elements. */
+    public final static String TOO_DEEP_DOC = _nestedDoc(TOO_DEEP_NESTING, "<div>", "</div>", "");
+
+    /**
+     * Builds a document of {@code nesting} opening fragments, the content on a line of its
+     * own, then {@code nesting} closing fragments. A newline is inserted after every 32nd
+     * fragment (those at index 0, 32, 64, ...), which keeps individual lines short.
+     *
+     * @param nesting number of times {@code open} and {@code close} are repeated
+     * @param open text appended for each level on the way in
+     * @param close text appended for each level on the way out
+     * @param content text placed between the opening and closing runs
+     * @return the assembled document
+     */
+    public static String _nestedDoc(int nesting, String open, String close, String content) {
+        StringBuilder sb = new StringBuilder(nesting * (open.length() + close.length()));
+        for (int i = 0; i < nesting; ++i) {
+            sb.append(open);
+            if ((i & 31) == 0) {
+                sb.append("\n");
+            }
+        }
+        sb.append("\n").append(content).append("\n");
+        for (int i = 0; i < nesting; ++i) {
+            sb.append(close);
+            if ((i & 31) == 0) {
+                sb.append("\n");
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Cleaning the deeply nested document must complete and yield a root node.
+     */
+    @Test
+    public void testDeepNesting(){
+        HtmlCleaner cleaner = new HtmlCleaner();
+        TagNode root = cleaner.clean(TOO_DEEP_DOC);
+        // Use the result so the test states what it expects, instead of leaving an unused local.
+        assertNotNull(root);
+    }
+}
@@ -0,0 +1,663 @@
+package org.htmlcleaner;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.regex.Matcher;
+
+import junit.framework.TestCase;
+
+/**
+ * Testing node manipulation after cleaning.
+ * TODO String escaping tests should be moved to UtilsTest class [Eugene]
+ * @author Eugene Sapozhnikov (blackorangebox@gmail.com)
+ */
+public class PropertiesTest extends TestCase {
+	
+	/**
+	 * Test behaviour of creating a new cleaner with properties including
+	 * tag provider set.
+	 * <p>
+	 * NOTE(review): this class extends JUnit 3 {@code TestCase} and the
+	 * method name does not start with "test", so it looks like this method
+	 * is never picked up by the runner - confirm, and rename it if it is
+	 * meant to run.
+	 * @throws Exception
+	 */
+	public void initialiseCleanerWithProperties() throws Exception {
+		// provider configured in the properties: expected to be used by the cleaner
+		CleanerProperties properties = new CleanerProperties();
+		properties.setTagInfoProvider(Html5TagProvider.INSTANCE);
+		HtmlCleaner cleaner = new HtmlCleaner(properties);
+		assertTrue(cleaner.getTagInfoProvider() instanceof Html5TagProvider);
+		
+		// null provider in the properties: a DefaultTagProvider is expected
+		properties = new CleanerProperties();
+		properties.setTagInfoProvider(null);
+		cleaner = new HtmlCleaner(properties);
+		assertTrue(cleaner.getTagInfoProvider() instanceof DefaultTagProvider);
+		
+		// null provider both as constructor argument and in the properties
+		properties = new CleanerProperties();
+		properties.setTagInfoProvider(null);
+		cleaner = new HtmlCleaner(null,properties);
+		assertTrue(cleaner.getTagInfoProvider() instanceof DefaultTagProvider);
+		
+		// provider passed to the constructor, null in the properties
+		properties = new CleanerProperties();
+		properties.setTagInfoProvider(null);
+		cleaner = new HtmlCleaner(Html5TagProvider.INSTANCE, properties);
+		assertTrue(cleaner.getTagInfoProvider() instanceof Html5TagProvider);
+		
+		// different providers in constructor and properties: the constructor
+		// argument is expected to win
+		properties = new CleanerProperties();
+		properties.setTagInfoProvider(DefaultTagProvider.INSTANCE);
+		cleaner = new HtmlCleaner(Html5TagProvider.INSTANCE, properties);
+		assertTrue(cleaner.getTagInfoProvider() instanceof Html5TagProvider);
+	}
+
+    /**
+     * Checks the serialized form of the entity div in test4.html with the
+     * advancedXmlEscape property switched on and then off.
+     */
+    public void testPropertiesAdvancedXmlEscape() throws Exception {
+        HtmlCleaner htmlCleaner = new HtmlCleaner();
+        CleanerProperties props = htmlCleaner.getProperties();
+        props.setNamespacesAware(false);
+
+        // advanced escaping enabled
+        props.setAdvancedXmlEscape(true);
+        String advanced = getXmlString(htmlCleaner, props);
+        assertTrue(advanced.contains("<div>&amp;&quot;&apos;&lt;&gt;</div>"));
+
+        // advanced escaping disabled: the ampersands are expected to be escaped again
+        props.setAdvancedXmlEscape(false);
+        String plain = getXmlString(htmlCleaner, props);
+        assertTrue(plain, plain.contains("<div>&amp;amp;&amp;quot;&amp;apos;&amp;lt;&amp;gt;</div>"));
+    }
+
+    /**
+     * Checks how the script and style content of test4.html is serialized
+     * with the useCdataForScriptAndStyle property switched on and then off.
+     */
+    public void testUseCdataForScriptAndStyle() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setNamespacesAware(false);
+        properties.setAdvancedXmlEscape(false);
+
+        // CDATA enabled: both elements are expected to be wrapped in the safe CDATA markers
+        properties.setUseCdataForScriptAndStyle(true);
+        String xmlString = getXmlString(cleaner, properties);
+        String expected = "<script>" + CData.SAFE_BEGIN_CDATA + "\nvar x=y&&z;\n" + CData.SAFE_END_CDATA
+                + "</script>";
+        assertTrue("looking for :\"" + expected + "\" in :\n" + xmlString, xmlString.indexOf(expected) >= 0);
+        expected = "<style>" + CData.SAFE_BEGIN_CDATA + "\n.test{font-size:10;}\n" + CData.SAFE_END_CDATA
+                + "</style>";
+        assertTrue("looking for :\"" + expected + "\" in :\n" + xmlString, xmlString.indexOf(expected) >= 0);
+
+        // CDATA disabled: the properties do not change between the two checks,
+        // so a single clean + serialize pass serves both
+        properties.setUseCdataForScriptAndStyle(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<script>var x=y&amp;&amp;z;</script>") >= 0);
+        assertTrue(xmlString.indexOf("<style>.test{font-size:10;}</style>") >= 0);
+    }
+
+    /**
+     * Checks that the translateSpecialEntities property decides whether the
+     * expected special characters appear in the serialized test4.html.
+     */
+    public void testTranslateSpecialEntities() throws Exception {
+        HtmlCleaner htmlCleaner = new HtmlCleaner();
+        CleanerProperties props = htmlCleaner.getProperties();
+        props.setAdvancedXmlEscape(false);
+        String translatedDiv = "<div>" + new String(new char[] { 244, 8240, 215, 376, 8364 }) + "</div>";
+
+        props.setTranslateSpecialEntities(true);
+        assertTrue(getXmlString(htmlCleaner, props).contains(translatedDiv));
+
+        props.setTranslateSpecialEntities(false);
+        assertFalse(getXmlString(htmlCleaner, props).contains(translatedDiv));
+    }
+
+    /**
+     * Checks that numeric character references in test4.html are turned into
+     * characters only while the recognizeUnicodeChars property is enabled.
+     */
+    public void testRecognizeUnicodeChars() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setAdvancedXmlEscape(false);
+
+        String unicodeCharString = "<div>" + new String(new char[] { 352, 8224, 8249 }) + "</div>";
+        properties.setRecognizeUnicodeChars(true);
+        assertTrue(getXmlString(cleaner, properties).indexOf(unicodeCharString) >= 0);
+
+        // both checks below look at the same property state, so serialize once
+        properties.setRecognizeUnicodeChars(false);
+        String unrecognized = getXmlString(cleaner, properties);
+        assertTrue(unrecognized.indexOf(unicodeCharString) < 0);
+        assertTrue(unrecognized.indexOf("<div>&amp;#352;&amp;#8224;&amp;#8249;</div>") >= 0);
+    }
+
+    /**
+     * Checks that the omitUnknownTags property drops the unknown
+     * {@code <mytag>} element of test4.html while keeping its content.
+     */
+    public void testOmitUnknownTags() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setAdvancedXmlEscape(false);
+
+        // tag omitted, text kept; one serialization covers both checks
+        properties.setOmitUnknownTags(true);
+        String omitted = getXmlString(cleaner, properties);
+        assertTrue(omitted.indexOf("<mytag>content of unknown tag</mytag>") < 0);
+        assertTrue(omitted.indexOf("content of unknown tag") >= 0);
+
+        properties.setOmitUnknownTags(false);
+        assertTrue(getXmlString(cleaner, properties).indexOf("<mytag>content of unknown tag</mytag>") >= 0);
+    }
+
+    /**
+     * Checks that the treatUnknownTagsAsContent property makes the unknown
+     * {@code <mytag>} element of test4.html come out as escaped text.
+     */
+    public void testTreatUnknownTagsAsContent() throws Exception {
+        HtmlCleaner htmlCleaner = new HtmlCleaner();
+        CleanerProperties props = htmlCleaner.getProperties();
+        props.setAdvancedXmlEscape(false);
+        props.setNamespacesAware(false);
+        props.setOmitUnknownTags(false);
+
+        props.setTreatUnknownTagsAsContent(true);
+        String asContent = getXmlString(htmlCleaner, props);
+        assertTrue(asContent.contains("&lt;mytag&gt;content of unknown tag&lt;/mytag&gt;"));
+
+        props.setTreatUnknownTagsAsContent(false);
+        String asTag = getXmlString(htmlCleaner, props);
+        assertTrue(asTag.contains("<mytag>content of unknown tag</mytag>"));
+    }
+
+    /**
+     * Checks that namespace prefixes and declarations of test4.html are kept
+     * when the namespacesAware property is on and stripped when it is off.
+     */
+    public void testNamespacesAware() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setAdvancedXmlEscape(false);
+
+        properties.setNamespacesAware(true);
+        String xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<html xmlns:my=\"my\">") >= 0);
+        assertTrue(xmlString.indexOf("<my:tag id=\"xxx\">aaa</my:tag>") >= 0);
+
+        // same property state for both checks: serialize once
+        properties.setNamespacesAware(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<html") >= 0);
+        assertTrue(xmlString.indexOf("<tag id=\"xxx\">aaa</tag>") >= 0);
+    }
+
+    /**
+     * Checks that the omitDeprecatedTags property drops the {@code <u>}
+     * element of test4.html while keeping its content.
+     */
+    public void testOmitDeprecatedTags() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setAdvancedXmlEscape(false);
+
+        // tag omitted, text kept; one serialization covers both checks
+        properties.setOmitDeprecatedTags(true);
+        String xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<u>content of deprecated tag</u>") < 0);
+        assertTrue(xmlString.indexOf("content of deprecated tag") >= 0);
+
+        properties.setOmitDeprecatedTags(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<u>content of deprecated tag</u>") >= 0);
+    }
+
+    /**
+     * Checks that the treatDeprecatedTagsAsContent property makes the
+     * {@code <u>} element of test4.html come out as escaped text.
+     */
+    public void testTreatDeprecatedTagsAsContent() throws Exception {
+        HtmlCleaner htmlCleaner = new HtmlCleaner();
+        CleanerProperties props = htmlCleaner.getProperties();
+        props.setAdvancedXmlEscape(false);
+        props.setOmitDeprecatedTags(false);
+
+        props.setTreatDeprecatedTagsAsContent(true);
+        String asContent = getXmlString(htmlCleaner, props);
+        assertTrue(asContent.contains("&lt;u&gt;content of deprecated tag&lt;/u&gt;"));
+
+        props.setTreatDeprecatedTagsAsContent(false);
+        String asTag = getXmlString(htmlCleaner, props);
+        assertTrue(asTag.contains("<u>content of deprecated tag</u>"));
+    }
+
+    /**
+     * Checks that the comment of test4.html is kept or dropped according to
+     * the omitComments property.
+     *
+     * @throws Exception if cleaning or serializing fails
+     */
+    public void testOmitComments() throws Exception {
+        HtmlCleaner htmlCleaner = new HtmlCleaner();
+        CleanerProperties props = htmlCleaner.getProperties();
+        props.setNamespacesAware(false);
+
+        props.setOmitComments(false);
+        assertTrue(getXmlString(htmlCleaner, props).contains("<!--my comment-->"));
+
+        props.setOmitComments(true);
+        assertFalse(getXmlString(htmlCleaner, props).contains("<!--my comment-->"));
+    }
+
+    /**
+     * Checks which empty elements of test4.html are collapsed into
+     * self-closing tags depending on the useEmptyElementTags property.
+     * The document is serialized once per property state.
+     */
+    public void testUseEmptyElementTags() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setAdvancedXmlEscape(false);
+
+        properties.setUseEmptyElementTags(true);
+        String xmlString = getXmlString(cleaner, properties);
+        // Tag <a> cannot be collapsed according to DefaultTagProvider
+        assertTrue(xmlString.indexOf("<a href=\"index.php\" />") < 0);
+        assertTrue(xmlString.indexOf("<a href=\"index.php\"></a>") >= 0);
+        assertTrue(xmlString.indexOf("<br />") >= 0);
+        // jericho reports that td can not be empty. so we test on <tr/>
+        // collapsing
+        assertTrue(xmlString, xmlString.indexOf("<tr><td></td></tr><tr />") >= 0);
+
+        properties.setUseEmptyElementTags(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<a href=\"index.php\"></a>") >= 0);
+        assertTrue(xmlString.indexOf("<table><tbody><tr><td></td></tr><tr></tr></tbody></table>") >= 0);
+    }
+
+    /**
+     * Checks the allowMultiWordAttributes property against test4.html.
+     * <p>
+     * The method then goes on to toggle allowHtmlInsideAttributes,
+     * ignoreQuestAndExclam and namespacesAware on the same properties
+     * object; property values set in earlier sections stay in effect for the
+     * later ones.
+     */
+    public void testAllowMultiWordAttributes() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        String xmlString;
+        properties.setAdvancedXmlEscape(false);
+        properties.setUseEmptyElementTags(false);
+        // multi-word attributes off: the words are expected as separate attributes
+        properties.setAllowMultiWordAttributes(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<div att=\"a b c\">") < 0);
+        assertTrue(xmlString.indexOf("<div att=\"a\" b=\"b\" c=\"c\">") >= 0);
+        properties.setAllowMultiWordAttributes(true);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<div att=\"a b c\">") >= 0);
+
+        // HTML inside attribute values
+        properties.setAllowHtmlInsideAttributes(true);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<a title=\"&lt;b&gt;Title&lt;b&gt; is here\">LINK 1</a>") >= 0);
+        properties.setAllowHtmlInsideAttributes(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<a title=\"&lt;b&gt;Title&lt;b&gt; is here\">LINK 1</a>") < 0);
+        assertTrue(xmlString.indexOf("<a title=\"\"><b>Title<b> is here&quot;&gt;LINK 1</b></b></a>") >= 0);
+
+        // markup starting with "<!" or "<?"
+        properties.setIgnoreQuestAndExclam(true);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("&lt;!INSTRUCTION1 id=&quot;aaa&quot;&gt;") < 0);
+        assertTrue(xmlString.indexOf("&lt;?INSTRUCTION2 id=&quot;bbb&quot;&gt;") < 0);
+        properties.setIgnoreQuestAndExclam(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("&lt;!INSTRUCTION1 id=&quot;aaa&quot;&gt;") >= 0);
+        assertTrue(xmlString.indexOf("&lt;?INSTRUCTION2 id=&quot;bbb&quot;&gt;") >= 0);
+
+        // namespace handling
+        properties.setNamespacesAware(true);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<html xmlns:my=\"my\">") >= 0);
+        assertTrue(xmlString.indexOf("<my:tag id=\"xxx\">aaa</my:tag>") >= 0);
+        properties.setNamespacesAware(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<html") >= 0);
+        assertTrue(xmlString.indexOf("<tag id=\"xxx\">aaa</tag>") >= 0);
+    }
+    /**
+     * Checks how the link of test4.html whose title attribute contains
+     * markup is serialized with the allowHtmlInsideAttributes property on
+     * and off.
+     */
+    public void testAllowHtmlInsideAttributes() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setAdvancedXmlEscape(false);
+
+        properties.setAllowHtmlInsideAttributes(true);
+        String xmlString = getXmlString(cleaner, properties);
+        assertTrue( xmlString.indexOf("<a title=\"&lt;b&gt;Title&lt;b&gt; is here\">LINK 1</a>") >= 0 );
+
+        // same property state for both checks: serialize once
+        properties.setAllowHtmlInsideAttributes(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue( xmlString.indexOf("<a title=\"&lt;b&gt;Title&lt;b&gt; is here\">LINK 1</a>") < 0 );
+        assertTrue( xmlString.indexOf("<a title=\"\"><b>Title<b> is here&quot;&gt;LINK 1</b></b></a>") >= 0 );
+    }
+    /**
+     * Checks that the "&lt;!" and "&lt;?" instructions of test4.html are
+     * dropped or emitted as escaped text according to the
+     * ignoreQuestAndExclam property.
+     */
+    public void testIgnoreQuestAndExclam() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setAdvancedXmlEscape(false);
+
+        // each property state is serialized once and checked for both instructions
+        properties.setIgnoreQuestAndExclam(true);
+        String xmlString = getXmlString(cleaner, properties);
+        assertTrue( xmlString.indexOf("&lt;!INSTRUCTION1 id=&quot;aaa&quot;&gt;") < 0 );
+        assertTrue( xmlString.indexOf("&lt;?INSTRUCTION2 id=&quot;bbb&quot;&gt;") < 0 );
+
+        properties.setIgnoreQuestAndExclam(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue( xmlString.indexOf("&lt;!INSTRUCTION1 id=&quot;aaa&quot;&gt;") >= 0 );
+        assertTrue( xmlString.indexOf("&lt;?INSTRUCTION2 id=&quot;bbb&quot;&gt;") >= 0 );
+    }
+    /**
+     * Checks comment handling for test4.html: omission via omitComments and
+     * the replacement used for double hyphens inside a comment.
+     *
+     * @throws IOException if the test document cannot be read or serialized
+     */
+    public void testComments() throws IOException {
+        HtmlCleaner htmlCleaner = new HtmlCleaner();
+        CleanerProperties props = htmlCleaner.getProperties();
+        props.setNamespacesAware(false);
+
+        // comments kept, then omitted
+        props.setOmitComments(false);
+        assertTrue(getXmlString(htmlCleaner, props).contains("<!--my comment-->"));
+        props.setOmitComments(true);
+        assertFalse(getXmlString(htmlCleaner, props).contains("<!--my comment-->"));
+
+        // hyphen replacement: first the current setting, then an explicit "*"
+        props.setOmitComments(false);
+        assertTrue(getXmlString(htmlCleaner, props).contains("<!-- comment with == - hyphen -->"));
+        props.setHyphenReplacementInComment("*");
+        assertTrue(getXmlString(htmlCleaner, props).contains("<!-- comment with ** - hyphen -->"));
+    }
+
+    /**
+     * Checks that the XML declaration is written or left out according to
+     * the omitXmlDeclaration property.
+     *
+     * @throws IOException if the test document cannot be read or serialized
+     */
+    public void testOmitXmlDeclaration() throws IOException {
+        HtmlCleaner htmlCleaner = new HtmlCleaner();
+        CleanerProperties props = htmlCleaner.getProperties();
+        props.setNamespacesAware(false);
+
+        props.setOmitXmlDeclaration(false);
+        assertTrue(getXmlString(htmlCleaner, props).contains("<?xml version=\"1.0\""));
+
+        props.setOmitXmlDeclaration(true);
+        assertFalse(getXmlString(htmlCleaner, props).contains("<?xml version=\"1.0\""));
+    }
+
+    /**
+     * Checks that the doctype of test4.html is written or left out according
+     * to the omitDoctypeDeclaration property.
+     */
+    public void testOmitDoctypeDeclaration() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setAdvancedXmlEscape(false);
+        String doctype = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">";
+
+        properties.setOmitDoctypeDeclaration(false);
+        assertTrue(getXmlString(cleaner, properties).indexOf(doctype) >= 0);
+        properties.setOmitDoctypeDeclaration(true);
+        assertTrue(getXmlString(cleaner, properties).indexOf(doctype) < 0);
+    }
+
+    /**
+     * Checks, with the HTML version set to 4, that the html/head/body
+     * envelope is left out or written according to the omitHtmlEnvelope
+     * property.
+     *
+     * @throws IOException if the test document cannot be read or serialized
+     */
+    public void testOmitHtmlEnvelope() throws IOException {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setHtmlVersion(4);
+        properties.setNamespacesAware(false);
+        properties.setAddNewlineToHeadAndBody(false);
+
+        // each property state is serialized once and checked at both ends
+        properties.setOmitHtmlEnvelope(true);
+        String xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<html><head>") < 0);
+        assertTrue(xmlString.indexOf("</body></html>") < 0);
+
+        properties.setOmitHtmlEnvelope(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString, xmlString.indexOf("<html><head>") >= 0);
+        assertTrue(xmlString, xmlString.indexOf("</body></html>") >= 0);
+    }
+    
+    /**
+     * Checks, with the HTML version set to 5, that the html/head/body
+     * envelope is left out or written according to the omitHtmlEnvelope
+     * property.
+     *
+     * @throws IOException if the test document cannot be read or serialized
+     */
+    public void testOmitHtml5Envelope() throws IOException {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setHtmlVersion(5);
+        properties.setNamespacesAware(false);
+        properties.setAddNewlineToHeadAndBody(false);
+
+        // each property state is serialized once and checked at both ends
+        properties.setOmitHtmlEnvelope(true);
+        String xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<html><head>") < 0);
+        assertTrue(xmlString.indexOf("</body></html>") < 0);
+
+        properties.setOmitHtmlEnvelope(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString, xmlString.indexOf("<html><head><style>") >= 0);
+        assertTrue(xmlString, xmlString.indexOf("</body></html>") >= 0);
+    }
+
+    /**
+     * Checks the pruneTags and allowTags properties against test4.html:
+     * pruned tags must be gone, and with an allow list only listed tags
+     * may remain.
+     */
+    public void testPruneProperties() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+
+        properties.reset();
+        // prune list: neither tag may survive; one serialization covers both checks
+        properties.setPruneTags("div,mytag");
+        String xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<div") < 0);
+        assertTrue(xmlString.indexOf("<mytag") < 0);
+
+        // allow list: div is listed, mytag is not
+        properties.setPruneTags("");
+        properties.setAllowTags("html,body,div");
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<div") >= 0);
+        assertTrue(xmlString.indexOf("<mytag") < 0);
+    }
+
+    /**
+     * Checks how the valueless {@code checked} attribute of the input in
+     * test4.html is serialized for different booleanAttributeValues settings.
+     */
+    public void testEmptyAttributesProperties() throws Exception {
+        HtmlCleaner htmlCleaner = new HtmlCleaner();
+        CleanerProperties props = htmlCleaner.getProperties();
+        props.reset();
+
+        // setting in effect right after reset()
+        String afterReset = getXmlString(htmlCleaner, props);
+        assertTrue(afterReset.contains("<input checked=\"checked\" />"));
+
+        props.setBooleanAttributeValues("empty");
+        assertTrue(getXmlString(htmlCleaner, props).contains("<input checked=\"\" />"));
+
+        props.setBooleanAttributeValues("true");
+        assertTrue(getXmlString(htmlCleaner, props).contains("<input checked=\"true\" />"));
+
+        // NOTE(review): "selft" is presumably a deliberately unrecognised value
+        // (or a typo for "self"); the expectation is the same output as after
+        // reset() - confirm the intent.
+        props.setBooleanAttributeValues("selft");
+        assertTrue(getXmlString(htmlCleaner, props).contains("<input checked=\"checked\" />"));
+    }
+
+    /**
+     * Cleans src/test/resources/test4.html (read as UTF-8) with the given
+     * cleaner and serializes the result with a {@link SimpleXmlSerializer}
+     * created from the given properties.
+     *
+     * @param cleaner    cleaner used to parse the test document
+     * @param properties properties handed to the serializer
+     * @return the serialized document
+     * @throws IOException if reading or serializing fails
+     */
+    private String getXmlString(HtmlCleaner cleaner, CleanerProperties properties) throws IOException {
+        File testDocument = new File("src/test/resources/test4.html");
+        TagNode root = cleaner.clean(testDocument, "UTF-8");
+        return new SimpleXmlSerializer(properties).getAsString(root);
+    }
+
+    /**
+     * Serializes a small document containing numeric, named and unknown
+     * entities twice with the same serializer: first with its initial
+     * settings, then after {@code setCreatingHtmlDom(true)}. The two expected
+     * strings differ in how the apostrophes and the non-breaking space are
+     * written.
+     */
+    public void testNbsp() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+        properties.setTranslateSpecialEntities(false);
+        properties.setOmitDoctypeDeclaration(false);
+        properties.setOmitXmlDeclaration(true);
+        properties.setAdvancedXmlEscape(true);
+        properties.setAddNewlineToHeadAndBody(false);
+
+        // test first when generating xml
+        TagNode node = cleaner.clean("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
+                + "<div>&#x20;&amp;&quot;&apos;'&lt;&gt;&nbsp;&garbage;&</div>");
+        SimpleXmlSerializer simpleXmlSerializer = new SimpleXmlSerializer(properties);
+        String xmlString = simpleXmlSerializer.getAsString(node, "UTF-8");
+        // here the non-breaking space is expected as the raw character 160
+        assertEquals("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
+                + "<html><head /><body><div> &amp;&quot;&apos;&apos;&lt;&gt;" + String.valueOf((char) 160)
+                + "&amp;garbage;&amp;</div></body></html>", xmlString.trim());
+
+        simpleXmlSerializer.setCreatingHtmlDom(true);
+        // then test when generating html
+        String domString = simpleXmlSerializer.getAsString(node, "UTF-8");
+        assertEquals("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n" +
+        // "<html><head /><body><div> &amp;&quot;&#39;&#39;&lt;&gt;&nbsp;&amp;garbage;&amp;</div></body></html>",
+                "<html><head /><body><div> &amp;&quot;''&lt;&gt;&nbsp;&amp;garbage;&amp;</div></body></html>",
+                domString.trim());
+    }
+
+    /**
+     * Makes sure that only a numeric reference with a leading 'x' is read as
+     * hexadecimal.
+     * <ul>
+     * <li>&#138A; is converted by FF to 3 characters: &#138; + 'A' + ';'</li>
+     * <li>&#0x138A; is converted by FF to 6? 7? characters: &#0 'x'+'1'+'3'+
+     * '8' + 'A' + ';' #0 is displayed kind of weird</li>
+     * <li>&#x138A; is a single character</li>
+     * </ul>
+     *
+     * @throws Exception if serializing fails
+     */
+    public void testHexConversion() throws Exception {
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitHtmlEnvelope(true);
+        props.setOmitXmlDeclaration(true);
+        SimpleXmlSerializer serializer = new SimpleXmlSerializer(props);
+        serializer.setCreatingHtmlDom(false);
+
+        // without the 'x': character 138 followed by the literal text "A;"
+        String decimalResult = serializer.getAsString( "<div>&#138A;</div>");
+        assertEquals("<div>" + String.valueOf(new char[] {138, 'A',';'}) + "</div>", decimalResult);
+
+        // with the 'x': the single character 0x138A
+        String hexResult = serializer.getAsString( "<div>&#x138A;</div>");
+        assertEquals("<div>" + String.valueOf(new char[] {0x138A}) + "</div>", hexResult);
+
+        props.reset();
+    }
+
+    /**
+     * Table-driven check of the {@code Utils.HEX_STRICT},
+     * {@code Utils.HEX_RELAXED} and {@code Utils.DECIMAL} patterns.
+     * <p>
+     * Each row holds the input string followed by three groups of four
+     * values, one group per pattern in the order strict, relaxed, decimal:
+     * whether {@code find()} is expected to succeed, the expected
+     * {@code start()}, the expected {@code end()} and the expected
+     * {@code group(1)}. Start, end and group are only checked when the
+     * pattern actually matched.
+     */
+    public void testPattern() {
+        for (Object[] test : new Object[][] {
+                new Object[] { "0x138A;", false, -1, -1, null, true, 0, 7, "x138A", true, 0, 1, "0" },
+                new Object[] { "x138A;", true, 0, 6, "x138A", true, 0, 6, "x138A", false, -1, -1, null },
+                new Object[] { "138;", false, -1, -1, null, false, -1, -1, null, true, 0, 4, "138" },
+                new Object[] { "139", false, -1, -1, null, false, -1, -1, null, true, 0, 3, "139" },
+                new Object[] { "x13A", true, 0, 4, "x13A", true, 0, 4, "x13A", false, -1, -1, null },
+                new Object[] { "13F", false, -1, -1, null, false, -1, -1, null, true, 0, 2, "13" },
+                new Object[] { "13", false, -1, -1, null, false, -1, -1, null, true, 0, 2, "13" },
+                new Object[] { "X13AZ", true, 0, 4, "X13A", true, 0, 4, "X13A", false, -1, -1, null } }) {
+            // unpack the row positionally; i walks over the columns
+            int i = 0;
+            String input = (String) test[i++];
+            boolean strict = (Boolean) test[i++];
+            int sstart = (Integer) test[i++];
+            int send = (Integer) test[i++];
+            String sgroup = (String) test[i++];
+            boolean relaxed = (Boolean) test[i++];
+            int rstart = (Integer) test[i++];
+            int rend = (Integer) test[i++];
+            String rgroup = (String) test[i++];
+            boolean decimal = (Boolean) test[i++];
+            int dstart = (Integer) test[i++];
+            int dend = (Integer) test[i++];
+            String dgroup = (String) test[i++];
+            // strict hex pattern
+            Matcher m = Utils.HEX_STRICT.matcher(input);
+            boolean actual = m.find();
+            assertEquals(input, strict, actual);
+            if (actual) {
+                assertEquals(input + " strict start ", sstart, m.start());
+                assertEquals(input + " strict end ", send, m.end());
+                assertEquals(input + " strict group ", sgroup, m.group(1));
+            }
+            // relaxed hex pattern
+            m = Utils.HEX_RELAXED.matcher(input);
+            actual = m.find();
+            assertEquals(input, relaxed, actual);
+            if (actual) {
+                assertEquals(input + " relaxed start ", rstart, m.start());
+                assertEquals(input + " relaxed end ", rend, m.end());
+                assertEquals(input + " relaxed group ", rgroup, m.group(1));
+            }
+            // decimal pattern
+            m = Utils.DECIMAL.matcher(input);
+            actual = m.find();
+            assertEquals(input, decimal, actual);
+            if (actual) {
+                assertEquals(input + " decimal start ", dstart, m.start());
+                assertEquals(input + " decimal end ", dend, m.end());
+                assertEquals(input + " decimal group ", dgroup, m.group(1));
+            }
+        }
+    }
+
+    /**
+     * Checks that a non-ASCII character in the input is passed through
+     * unchanged and that the unclosed div is closed inside the h3.
+     */
+    public void testConvertUnicode() throws Exception {
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitHtmlEnvelope(true);
+        props.setOmitXmlDeclaration(true);
+        props.setUseEmptyElementTags(false);
+        SimpleXmlSerializer serializer = new SimpleXmlSerializer(props);
+
+        // right tick is special unicode character 8217
+        String serialized = serializer.getAsString(
+                "<h3><u><strong>President’s Message</strong></u><div> </h3>");
+
+        assertEquals("<h3><u><strong>President’s Message</strong></u><div> </div></h3>", serialized);
+    }
+
+    // Building blocks for the documents assembled in disabledTestConvertOldStyleComments():
+    // the markup before and after the script content, a sample script line and
+    // the HTML comment delimiters.
+    private static final String HTML_COMMENT_OUT_BEGIN = "<html><head><script>";
+    private static final String HTML_COMMENT_OUT_END = "</script></head><body></body></html>";
+    private static final String SAMPLE_JS = "var x = ['foo','bar'];";
+    private static final String COMMENT_START = "<!--";
+    private static final String COMMENT_END = "-->";
+
+    /**
+     * Test conversion of the former (now bad practice) idiom of hiding
+     * script/style content inside an HTML comment:
+     * 
+     * <pre>
+     * &lt;style>&lt;!-- style info -->&lt;/style>
+     * </pre>
+     * 
+     * into &lt;style>/(star)&lt;![CDATA[(star)/ style info
+     * /(star)]]>(star)/&lt;/style>
+     * 
+     * Each test data entry is a pair: the input document and the output
+     * expected when CDATA sections are used for script and style.
+     * 
+     * Note: disabled because it doesn't test actual behavior. The method
+     * name deliberately does not start with "test".
+     * @throws IOException 
+     */
+    public void disabledTestConvertOldStyleComments() throws IOException {
+        // TODO: May need additional flag to handle '<' inside of scripts
+        // dontEscape() in xml serializer should not be triggered based on use
+        // cdata
+        // but dontEscape is used by subclasses -- need to investigate best
+        // solution.
+        // maybe o.k. to have the < > be translated. That is what original test
+        // does.
+        // but the ' should probably not be touched??
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = new CleanerProperties();
+        properties.setOmitXmlDeclaration(true);
+        properties.setUseCdataForScriptAndStyle(true);
+        properties.setAddNewlineToHeadAndBody(false);
+        // test for positive matches to old-style comment hacks
+        for (String[] testData : new String[][] {
+                // normal case - remove old-style comment out hack
+                new String[] {
+                        HTML_COMMENT_OUT_BEGIN + "//" + COMMENT_START + "\n" + SAMPLE_JS + "//" + COMMENT_END + "\n"
+                                + HTML_COMMENT_OUT_END,
+                        HTML_COMMENT_OUT_BEGIN + CData.SAFE_BEGIN_CDATA + "\n" + SAMPLE_JS
+                                + CData.SAFE_END_CDATA + "\n" + HTML_COMMENT_OUT_END },
+                // don't let random whitespace confuse things
+                new String[] {
+                        HTML_COMMENT_OUT_BEGIN + "\n\n\n\n" + "//" + "   \t" + COMMENT_START + "\n" + SAMPLE_JS
+                                + "\n\n\n" + "//" + COMMENT_END + "\n\n\t\n" + HTML_COMMENT_OUT_END,
+                        HTML_COMMENT_OUT_BEGIN + "\n\n\n\n" + CData.SAFE_BEGIN_CDATA + "\n" + SAMPLE_JS
+                                + "\n\n\n" + "//" + CData.SAFE_END_CDATA + "\n\n\t\n" + HTML_COMMENT_OUT_END },
+
+        }) {
+            doTestConvertOldStyleComments(cleaner, properties, testData);
+        }
+
+        // test for false positives
+        for (String[] testData : new String[][] {
+        // make sure not to remove real comments
+        new String[] {
+                HTML_COMMENT_OUT_BEGIN + "//" + "an ordinary comment" + "\n" + SAMPLE_JS + "//" + "a final remark"
+                        + HTML_COMMENT_OUT_END,
+                HTML_COMMENT_OUT_BEGIN + CData.SAFE_BEGIN_CDATA + "//" + "an ordinary comment" + "\n"
+                        + SAMPLE_JS + "//" + "a final remark" + CData.SAFE_END_CDATA + HTML_COMMENT_OUT_END }, }) {
+            doTestConvertOldStyleComments(cleaner, properties, testData);
+        }
+    }
+
+    /**
+     * Cleans {@code testData[0]} and serializes it twice: without CDATA for
+     * script and style, where the output must equal the input, and with
+     * CDATA, where the output must equal {@code testData[1]}.
+     *
+     * @param cleaner    cleaner used to parse the input
+     * @param properties serializer properties; useCdataForScriptAndStyle is
+     *                   changed by this method and left set to true
+     * @param testData   input document at index 0, expected CDATA output at index 1
+     */
+    private void doTestConvertOldStyleComments(HtmlCleaner cleaner, CleanerProperties properties, String[] testData)
+            throws IOException {
+        TagNode root = cleaner.clean(testData[0]);
+
+        // test to make sure the no-op still works
+        properties.setUseCdataForScriptAndStyle(false);
+        String withoutCdata = new SimpleXmlSerializer(properties).getAsString(root);
+        assertEquals(testData[0], withoutCdata);
+
+        // now test actual
+        properties.setUseCdataForScriptAndStyle(true);
+        String withCdata = new SimpleXmlSerializer(properties).getAsString(root);
+        assertEquals(testData[1], withCdata);
+    }
+
+    /**
+     * Checks that a document whose style element already wraps its content
+     * in commented-out CDATA markers is serialized back unchanged when CDATA
+     * is used for script and style.
+     */
+    public void testIgnoreClosingCData() throws IOException {
+        final String document = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
+                + "<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta http-equiv=\"content-type\" content=\"application/xhtml+xml; charset=utf-8\" /><link href=\"aswa.css\" type=\"text/css\" rel=\"stylesheet\" /><title>ASWA - Events</title>"
+                + "<style type=\"text/css\">/*<![CDATA[*/\r\n"
+                + "#ampmep_188 { }\r\n"
+                + "/*]]>*/</style></head><body></body></html>";
+
+        CleanerProperties props = new CleanerProperties();
+        props.setOmitXmlDeclaration(true);
+        props.setUseCdataForScriptAndStyle(true);
+        props.setAddNewlineToHeadAndBody(false);
+        props.setIgnoreQuestAndExclam(false);
+
+        TagNode root = new HtmlCleaner(props).clean(document);
+        String serialized = new SimpleXmlSerializer(props).getAsString(root);
+
+        assertEquals(document, serialized);
+    }
+
+    /**
+     * Checks that reserved XML characters in test4.html are written as
+     * numeric character references while transResCharsToNCR is enabled and
+     * as named entities otherwise.
+     */
+    public void testTransResCharsToNCR() throws Exception {
+        HtmlCleaner cleaner = new HtmlCleaner();
+        CleanerProperties properties = cleaner.getProperties();
+
+        properties.setNamespacesAware(false);
+        properties.setAdvancedXmlEscape(true);
+
+        // each property state is serialized once and checked for both divs
+        properties.setTransResCharsToNCR(true);
+        String xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<div>1.&#38;&#34;&#39;&#60;&#62;</div>") >= 0);
+        assertTrue(xmlString.indexOf("<div>2.&#38;&#34;&#39;&#60;&#62;</div>") >= 0);
+
+        properties.setTransResCharsToNCR(false);
+        xmlString = getXmlString(cleaner, properties);
+        assertTrue(xmlString.indexOf("<div>1.&amp;&quot;&apos;&lt;&gt;</div>") >= 0);
+        assertTrue(xmlString.indexOf("<div>2.&amp;&quot;&apos;&lt;&gt;</div>") >= 0);
+    }
+}
@@ -0,0 +1,21 @@
+package org.htmlcleaner;
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+/**
+ * Simple test checking that stray end tags, which have no matching start tag, are dropped
+ * from the cleaned output.
+ */
+public class RandomCloseTagTest extends TestCase {
+
+	/**
+	 * Cleans a text fragment interleaved with unmatched {@code </span>}, {@code </b>} and
+	 * {@code </div>} end tags and expects only the text to remain once the HTML envelope and
+	 * the XML declaration are omitted from the serialized result.
+	 *
+	 * @throws IOException if serialization fails
+	 */
+	public void testRandomCloseTagsRemoved() throws IOException {
+		HtmlCleaner htmlCleaner = new HtmlCleaner();
+		CleanerProperties props = htmlCleaner.getProperties();
+		props.setOmitXmlDeclaration(true);
+		props.setOmitHtmlEnvelope(true);
+
+		TagNode root = htmlCleaner.clean("Some</span> text </b></div>");
+		String serialized = new SimpleXmlSerializer(props).getAsString(root);
+
+		assertEquals("Some text ", serialized);
+	}
+}
@@ -0,0 +1,19 @@
+package org.htmlcleaner;
+
+import java.io.File;
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+/**
+ * Smoke tests that run the cleaner over two HTML resource files.
+ * <p>
+ * Each test fails if cleaning throws or if no root node comes back. The resource paths are
+ * relative, so the tests must be run from the project root.
+ */
+public class RandomPageTest extends TestCase {
+
+	/**
+	 * Cleans {@code gg_prob.html} and checks that a document tree is returned.
+	 *
+	 * @throws IOException if the resource cannot be read
+	 */
+	public void testPage() throws IOException {
+		HtmlCleaner cleaner = new HtmlCleaner();
+		TagNode root = cleaner.clean(new File("src/test/resources/gg_prob.html"));
+		// Without an assertion this test could only ever fail on an exception.
+		assertNotNull("cleaning gg_prob.html returned no root node", root);
+	}
+
+	/**
+	 * Cleans {@code gg_prob_cleaned.html} and checks that a document tree is returned.
+	 *
+	 * @throws IOException if the resource cannot be read
+	 */
+	public void testHtml() throws IOException {
+		HtmlCleaner cleaner = new HtmlCleaner();
+		TagNode root = cleaner.clean(new File("src/test/resources/gg_prob_cleaned.html"));
+		assertNotNull("cleaning gg_prob_cleaned.html returned no root node", root);
+	}
+}
@@ -0,0 +1,223 @@
+/*  Copyright (c) 2006-2013, the HtmlCleaner Project
+    All rights reserved.
+
+    Redistribution and use of this software in source and binary forms,
+    with or without modification, are permitted provided that the following
+    conditions are met:
+
+    * Redistributions of source code must retain the above
+      copyright notice, this list of conditions and the
+      following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the
+      following disclaimer in the documentation and/or other
+      materials provided with the distribution.
+
+    * The name of HtmlCleaner may not be used to endorse or promote
+      products derived from this software without specific prior
+      written permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+*/
+package org.htmlcleaner;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class SVGTest extends AbstractHtmlCleanerTest{
+
+	@Test
+	public void svgTreatedAsPhrasing() throws IOException
+	{
+		CleanerProperties cleanerProperties = new CleanerProperties();
+		cleanerProperties.setOmitXmlDeclaration(false);
+		cleanerProperties.setOmitDoctypeDeclaration(false);
+		cleanerProperties.setIgnoreQuestAndExclam(false);
+		cleanerProperties.setAddNewlineToHeadAndBody(false);
+		cleanerProperties.setUseCdataFor("script,style,altscript");
+		this.cleaner = new HtmlCleaner(cleanerProperties);
+		this.serializer = new SimpleXmlSerializer(cleaner.getProperties());
+
+		assertHTML(
+				"<p><svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\"><circle cx=\"100\" cy=\"50\" fill=\"red\" r=\"40\" stroke=\"black\" stroke-width=\"2\" /></svg></p>",
+		"<p><svg xmlns=\"http://www.w3.org/2000/svg\" version=\"1.1\"><circle cx=\"100\" cy=\"50\" fill=\"red\" r=\"40\" stroke=\"black\" stroke-width=\"2\"></circle></svg></p>"
+
+		);
+	}
+	
+	@Test 
+	public void nestedSVG()
+    {
+        String html = "<!DOCTYPE html>\n"
+            + "<html lang=\"en\">\n"
+            + "<head>\n"
+            + "</head>\n"
+            + "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
+            + "<svg xmlns=\"http://www.w3.org/2000/\">\n"
+            + "    <svg></svg>\n"
+            + "</svg>\n"
+            + "</body>\n"
+            + "</html>";
+        new HtmlCleaner().clean(html);
+
+        html = "<!DOCTYPE html>\n"
+            + "<html lang=\"en\">\n"
+            + "<head>\n"
+            + "</head>\n"
+            + "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
+            + "<svg xmlns=\"http://www.w3.org/2000/svg\">\n"
+            + "    <circle cx=\"50\" cy=\"50\" r=\"40\" stroke=\"black\" stroke-width=\"3\" fill=\"red\" />\n"
+            + "</svg>\n"
+            + "</body>\n"
+            + "</html>";
+        new HtmlCleaner().clean(html);
+
+        html = "<!DOCTYPE html>\n"
+            + "<html lang=\"en\">\n"
+            + "<head>\n"
+            + "</head>\n"
+            + "<body itemscope itemtype=\"http://schema.org/WebPage\">\n"
+            + "<svg xmlns=\"http://www.w3.org/2000/svg\">\n"
+            + "    <svg></svg>\n"
+            + "</svg>\n"
+            + "</body>\n"
+            + "</html>";
+        new HtmlCleaner().clean(html);
+    }
+	
+	@Test
+	public void svgCloseAssumedNS4() throws Exception{
+		String html="<html><head></head><body><svg><h3>Title</h3><div>text</div></body></html>";
+		CleanerProperties props = new CleanerProperties();
+	    props.setNamespacesAware(true);
+	    props.setOmitXmlDeclaration(true);
+	    HtmlCleaner cleaner = new HtmlCleaner(props);
+		String cleaned = new SimpleHtmlSerializer(cleaner.getProperties(), false).getAsString(cleaner.clean(html));
+		assertEquals("<html><head></head><body><svg></svg><h3>Title</h3><div>text</div></body></html>", cleaned);
+	}
+	
+	@Test
+	@Ignore // This is a tricky one as "a" is allowed in SVG, so the rest is assumed to be OK.
+	public void svgCloseAssumedNS3() throws Exception{
+		String html="<html><head></head><body><svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
+		CleanerProperties props = new CleanerProperties();
+	    props.setNamespacesAware(true);
+	    props.setOmitXmlDeclaration(true);
+	    HtmlCleaner cleaner = new HtmlCleaner(props);
+		String cleaned = new SimpleHtmlSerializer(cleaner.getProperties(), false).getAsString(cleaner.clean(html));
+		assertEquals("<html><head></head><body><svg></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>", cleaned);
+	}
+	
+	@Test
+	public void svgCloseAssumedNS2() throws Exception{
+		String html="<html><head></head><body><svg><title></title></svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
+		CleanerProperties props = new CleanerProperties();
+	    props.setNamespacesAware(true);
+	    props.setOmitXmlDeclaration(true);
+	    HtmlCleaner cleaner = new HtmlCleaner(props);
+		String cleaned = new SimpleHtmlSerializer(cleaner.getProperties(), false).getAsString(cleaner.clean(html));
+		assertEquals("<html><head></head><body><svg><title></title></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>", cleaned);
+	}
+	
+	@Test
+	public void svgCloseAssumedNS() throws Exception{
+		String html="<html><head></head><body><svg></svg><a><br><h3>Title</h3><div>text</cite></div></a></body></html>";
+		CleanerProperties props = new CleanerProperties();
+	    props.setNamespacesAware(true);
+	    props.setOmitXmlDeclaration(true);
+	    HtmlCleaner cleaner = new HtmlCleaner(props);
+		String cleaned = new SimpleHtmlSerializer(cleaner.getProperties(), false).getAsString(cleaner.clean(html));
+		assertEquals("<html><head></head><body><svg></svg><a><br /><h3>Title</h3><div>text</div></a></body></html>", cleaned);
+	}
+	
+	@Test
+	public void missingSVGNamespace() throws IOException {
+		String initial = "<html><head><title>Title of document</title></head><body><svg><title>A big circle.</title></svg></body></html>";
+		String expected = "<html>\n<head><title>Title of document</title></head>\n<body><svg><title>A big circle.</title></svg></body></html>";
+		assertCleaned(initial, expected);
+	}
+
+	@Test
+	public void preserveSVGtags() throws IOException{
+		
+        cleaner.getProperties().setOmitXmlDeclaration(false);
+        cleaner.getProperties().setOmitDoctypeDeclaration(false);
+        cleaner.getProperties().setOmitUnknownTags(true);
+        cleaner.getProperties().setNamespacesAware(true);
+        
+		String initial = readFile("src/test/resources/test18.html");
+		String expected = readFile("src/test/resources/test18_expected.html"); 
+		
+		assertCleaned(initial,expected);
+	}
+	
+	@Test
+	public void preserveSVGtags2() throws IOException{
+		
+        cleaner.getProperties().setOmitXmlDeclaration(false);
+        cleaner.getProperties().setOmitDoctypeDeclaration(false);
+        cleaner.getProperties().setOmitUnknownTags(true);
+        cleaner.getProperties().setNamespacesAware(true);
+        
+		String initial = readFile("src/test/resources/test19.html");
+		String expected = readFile("src/test/resources/test19_expected.html"); 
+		assertCleaned(initial,expected);
+	}
+
+	
+	@Test
+	public void preserveSVGtags3() throws IOException{
+		
+        cleaner.getProperties().setOmitXmlDeclaration(false);
+        cleaner.getProperties().setOmitDoctypeDeclaration(false);
+        cleaner.getProperties().setNamespacesAware(true);
+        
+		String initial = readFile("src/test/resources/test20.html");
+		String expected = readFile("src/test/resources/test20_expected.html"); 
+
+		assertCleaned(initial,expected);
+	}
+	
+	@Test
+	public void preserveSVGtagsWithTitle() throws IOException{
+		
+        cleaner.getProperties().setOmitXmlDeclaration(false);
+        cleaner.getProperties().setOmitDoctypeDeclaration(false);
+        cleaner.getProperties().setNamespacesAware(true);
+        cleaner.getProperties().setOmitUnknownTags(true);
+        
+		String initial = readFile("src/test/resources/test21.html");
+		String expected = readFile("src/test/resources/test21_expected.html"); 
+
+		assertCleaned(initial,expected);
+	}
+	
+	@Test
+	public void preserveSVGstylesInPlace() throws IOException{
+		
+        cleaner.getProperties().setOmitXmlDeclaration(false);
+        cleaner.getProperties().setOmitDoctypeDeclaration(false);
+        cleaner.getProperties().setNamespacesAware(true);
+        cleaner.getProperties().setOmitUnknownTags(true);
+        
+		String initial = readFile("src/test/resources/test25.html");
+		String expected = readFile("src/test/resources/test25_expected.html"); 
+
+		assertCleaned(initial,expected);
+	}
+}
@@ -0,0 +1,77 @@
+package org.htmlcleaner;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+
+/**
+ * Tests for some common uses of {@code <script>} tags within {@code <head>} elements.
+ * @author scottw
+ *
+ */
+public class ScriptTest extends AbstractHtmlCleanerTest {
+
+	/** Test page cleaned by both {@link #another()} and {@link #getScripts()}. */
+	private static final String SCRIPT_PAGE = "src/test/resources/script_test.html";
+
+	/**
+	 * Cleans the script test page with HTML inside attributes, multi-word attributes and
+	 * unicode recognition enabled and comments omitted. The resulting tree is not inspected,
+	 * so the test passes as long as cleaning does not throw.
+	 */
+	@Test
+	public void another() throws IOException {
+		HtmlCleaner htmlCleaner = new HtmlCleaner();
+		CleanerProperties props = htmlCleaner.getProperties();
+		props.setAllowHtmlInsideAttributes(true);
+		props.setAllowMultiWordAttributes(true);
+		props.setRecognizeUnicodeChars(true);
+		props.setOmitComments(true);
+
+		htmlCleaner.clean(new File(SCRIPT_PAGE));
+	}
+
+	/**
+	 * Cleans the script test page with default settings and expects the child tags of
+	 * {@code <head>} to include exactly three {@code <script>} elements whose {@code src}
+	 * attributes are {@code x.js}, {@code y.js} and {@code z.js}, in that order.
+	 */
+	@Test
+	public void getScripts() throws IOException {
+		TagNode root = new HtmlCleaner().clean(new File(SCRIPT_PAGE));
+		TagNode headNode = root.findElementByName("head", false);
+		List<TagNode> headChildren = headNode.getChildTagList();
+
+		// Keep only the script elements, in the order they appear under head.
+		List<TagNode> scriptNodes = new ArrayList<TagNode>();
+		for (TagNode candidate : headChildren) {
+			if (candidate.getName().equals("script")) {
+				scriptNodes.add(candidate);
+			}
+		}
+
+		assertEquals(3, scriptNodes.size());
+		assertEquals("x.js", scriptNodes.get(0).getAttributeByName("src"));
+		assertEquals("y.js", scriptNodes.get(1).getAttributeByName("src"));
+		assertEquals("z.js", scriptNodes.get(2).getAttributeByName("src"));
+	}
+
+	/**
+	 * A single-quoted {@code onclick} value containing double quotes is expected to come out
+	 * double-quoted, with the inner quotes written as {@code &quot;}.
+	 */
+	@Test
+	public void scriptAttribute() throws IOException {
+		cleaner.getProperties().setUseCdataForScriptAndStyle(true);
+
+		String initial = "<button onclick='aaa(\"bbb\")'>Click here!</button>";
+		String expected ="<html>\n<head />\n<body><button onclick=\"aaa(&quot;bbb&quot;)\">Click here!</button></body></html>";
+		assertCleaned(initial, expected);
+	}
+
+	/*
+	 * Test for issue #88 - thanks to Serge Dyomin
+	 *
+	 * Cleans fixture test16 with the XML declaration omitted, comments kept and special
+	 * entities translated, then compares the SimpleHtmlSerializer output with the expected
+	 * fixture.
+	 */
+	@Test
+	public void scriptAttributeQuotes() throws IOException {
+		HtmlCleaner quoteCleaner = new HtmlCleaner();
+		CleanerProperties quoteProps = quoteCleaner.getProperties();
+		quoteProps.setOmitXmlDeclaration(true);
+		quoteProps.setOmitComments(false);
+		quoteProps.setTranslateSpecialEntities(true);
+
+		String initial = readFile("src/test/resources/test16.html");
+		String expected = readFile("src/test/resources/test16_expected.html");
+
+		TagNode root = quoteCleaner.clean(initial);
+		String output = new SimpleHtmlSerializer(quoteCleaner.getProperties()).getAsString(root);
+
+		assertEquals(expected, output);
+	}
+}
--- a/Show More
+++ b/Show More