2010年5月1日星期六

使用TidyNet清理HTML并将其转换为XHTML

TidyNet(http://sourceforge.net/projects/tidynet/)是用 C# 编写的 Tidy API，对字符编码的支持比较好，不会出现像 NTidy 那样的乱码问题。

// Runs the markup held in `restext` through Tidy.NET and leaves the cleaned
// result in `cleanedMarkUp`. Throws ApplicationException on the first
// error-level message reported by Tidy.
Tidy doc = new Tidy();
TidyMessageCollection tmc = new TidyMessageCollection();

// Set some Tidy options, refer to the HTML Tidy docs for more info
doc.Options.DocType = DocType.Strict;
// Enabled because the goal of this snippet is XHTML output; XmlOut on its own
// presumably only requests well-formed XML -- verify against the HTML Tidy docs.
doc.Options.Xhtml = true;
doc.Options.LogicalEmphasis = true;
doc.Options.DropFontTags = true;
doc.Options.DropEmptyParas = true;
doc.Options.QuoteAmpersand = true;
doc.Options.TidyMark = true;
doc.Options.MakeClean = true;
doc.Options.IndentContent = true;
doc.Options.SmartIndent = true;
doc.Options.Spaces = 4;
doc.Options.WrapLen = 100;
// Must agree with the UTF-8 encoding used below to build the input bytes
// and to decode the output bytes.
doc.Options.CharEncoding = CharEncoding.UTF8;
doc.Options.XmlOut = true;

// Declared outside the using block so the result stays available afterwards.
string cleanedMarkUp;

byte[] byteArray = System.Text.Encoding.UTF8.GetBytes(restext);

// Constructing the stream from the byte array starts it at position 0, so no
// separate Write + rewind is needed; both streams are disposed on exit.
using (MemoryStream input = new MemoryStream(byteArray))
using (MemoryStream output = new MemoryStream())
{
    doc.Parse(input, output, tmc);

    // Fail on the first error-level message; lower-severity messages are ignored.
    foreach (TidyMessage message in tmc)
    {
        if (message.Level == MessageLevel.Error)
        {
            throw new ApplicationException(String.Format("{0} at line {1} column {2}",
                message.Message, message.Line,
                message.Column));
        }
    }

    cleanedMarkUp = System.Text.Encoding.UTF8.GetString(output.ToArray());
}

没有评论:

发表评论