TidyNet(http://sourceforge.net/projects/tidynet/)是用 C# 编写的 Tidy API,对字符编码的支持比较好,不会出现像 NTidy 那样的乱码。下面的示例中,restext 是待清理的 HTML 字符串,清理后的结果保存在 cleanedMarkUp 中。
// Runs the markup held in `restext` through TidyNet and stores the tidied
// result in `cleanedMarkUp`. Throws ApplicationException for the first Tidy
// message reported at MessageLevel.Error.
Tidy doc = new Tidy();
TidyMessageCollection tmc = new TidyMessageCollection();

// Set some Tidy options, refer to the HTML Tidy docs for more info
// NOTE(review): Xhtml is false while XmlOut is true — confirm this combination is intended.
doc.Options.DocType = DocType.Strict;
doc.Options.Xhtml = false;
doc.Options.LogicalEmphasis = true;
doc.Options.DropFontTags = true;
doc.Options.DropEmptyParas = true;
doc.Options.QuoteAmpersand = true;
doc.Options.TidyMark = true;
doc.Options.MakeClean = true;
doc.Options.IndentContent = true;
doc.Options.SmartIndent = true;
doc.Options.Spaces = 4;
doc.Options.WrapLen = 100;
doc.Options.CharEncoding = CharEncoding.UTF8;
doc.Options.XmlOut = true;

string cleanedMarkUp;

// Encode the input as UTF-8, matching the CharEncoding option set above.
byte[] byteArray = System.Text.Encoding.UTF8.GetBytes(restext);

// Wrapping the byte array directly yields a readable stream positioned at 0,
// so no separate Write + rewind is needed. The using statements guarantee both
// streams are disposed even when an error message causes a throw below.
using (MemoryStream input = new MemoryStream(byteArray))
using (MemoryStream output = new MemoryStream())
{
    doc.Parse(input, output, tmc);

    // Fail on the first message Tidy reported at Error level.
    foreach (TidyMessage message in tmc)
    {
        if (message.Level == MessageLevel.Error)
        {
            throw new ApplicationException(String.Format("{0} at line {1} column {2}",
                message.Message, message.Line,
                message.Column));
        }
    }

    // Decode Tidy's output with the same encoding used for the input.
    cleanedMarkUp = System.Text.Encoding.UTF8.GetString(output.ToArray());
}
没有评论:
发表评论