Csharp/C#教程:C#实现将HTML转换成纯文本的方法分享

本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:

使用方法:
代码如下:
HtmlToText convert = new HtmlToText();
textBox2.Text = convert.Convert(textBox1.Text);

C#代码如下:

/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a small hand-written scanner: it walks the input one
/// character at a time, turns selected tags into line breaks or tabs,
/// drops the content of tags such as <c>script</c> and <c>style</c>,
/// collapses runs of whitespace, and finally decodes HTML entities.
/// Instances keep parsing state in fields, so a single instance must not
/// be used by several threads at once.
/// </remarks>
class HtmlToText
{
    // Static data tables
    protected static Dictionary<string, string> _tags;
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        // Tags (lower case, closing tags prefixed with '/') mapped to the
        // text that is written to the output when the tag is encountered.
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        // Tags whose entire inner content is discarded.
        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated
    /// as an empty document.</param>
    /// <returns>Resulting plain text with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.Write(value);

                if (_ignoreTags.Contains(tag))
                    EatInnerContent(tag);
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last so that decoded '<' / '>' characters
        // are never mistaken for markup by the scanner above.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns
    /// information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true when a '/' is found after the
    /// tag name (outside quoted attribute values).</param>
    /// <returns>The lower-cased tag name, including a leading '/' for
    /// closing tags; "!--" for an HTML comment; or an empty string when
    /// the current character is not '&lt;'.</returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // HTML comment: skip through the terminating "-->" so that a
            // '>' or markup inside the comment does not leak into the
            // output. Searching from _pos + 1 also accepts "<!-->".
            if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
            {
                int end = _html.IndexOf("-->", _pos + 1, StringComparison.Ordinal);
                _pos = (end < 0) ? _html.Length : end + 3;
                return "!--";
            }

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
                MoveAhead();
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
                MoveAhead();

            // Invariant lower-casing: tag names are identifiers, and a
            // culture-sensitive ToLower() breaks lookups under cultures
            // such as tr-TR ("DIV" -> "dıv").
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                    EatQuotedValue();
                else
                {
                    if (Peek() == '/')
                        selfClosing = true;
                    MoveAhead();
                }
            }
            MoveAhead();    // Closing '>'
        }
        return tag;
    }

    /// <summary>
    /// Consumes inner content from the current tag, up to and including
    /// the matching end tag (or the end of the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-cased name of the tag whose content is
    /// discarded, without a leading '/'.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        // script and style hold raw text, so something that looks like a
        // nested opening tag (e.g. inside a string literal) must not be
        // counted. Other elements may legitimately nest.
        bool canNest = tag != "script" && tag != "style";
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);
                if (inner == endTag)
                {
                    if (--depth == 0)
                        return;
                }
                else if (canNest && !selfClosing && inner == tag)
                {
                    // Nested element of the same kind: its end tag must
                    // not terminate the outer element.
                    depth++;
                }
            }
            else MoveAhead();
        }
    }

    /// <summary>
    /// Returns true if the current position is at the end of the string.
    /// </summary>
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    /// <summary>
    /// Safely returns the character at the current position, or the NUL
    /// character when the position is past the end of the input.
    /// </summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>
    /// Safely advances the current position to the next character; the
    /// position never moves beyond the end of the input.
    /// </summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character.
    /// </summary>
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
            MoveAhead();
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or
    /// the start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
                break;
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. Does nothing when
    /// the current character is not a single or double quote.
    /// </summary>
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value. A line break also ends the value so that
            // an unterminated quote cannot swallow the rest of the input.
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                MoveAhead();    // Closing quote
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                        FlushCurrLine();
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write, processed one character at a
        /// time.</param>
        public void Write(string s)
        {
            foreach (char c in s)
                Write(c);
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Write single space character
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                        _currLine.Append(' ');
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        /// <summary>
        /// Appends the current line to the output buffer. At most one
        /// consecutive blank line is kept, and blank lines at the very
        /// start of the output are dropped.
        /// </summary>
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // Determine if line contains non-space characters. Entities
            // are still encoded at this point, so a line made up only of
            // non-breaking-space entities counts as empty.
            string tmp = line.Replace("&nbsp;", String.Empty);
            if (tmp.Length == 0)
            {
                // An empty line
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string, flushing any pending
        /// line first.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
                FlushCurrLine();
            return _text.ToString();
        }
    }
}

希望本文所述对大家的C#程序设计有所帮助。

您可能感兴趣的文章:ASP.net(c#)生成html的几种解决方案[思路];C#将html table导出成excel实例;C#下解析HTML的两种方法介绍;使用C#获取网页HTML源码的例子;asp.net(C#)动态添加非ASP的标准html控件(如添加Script标签);C#导出生成excel文件的方法小结(xml,html方式);c#中过滤html的正则表达式;C#正则表达式匹配HTML中的图片路径,图片地址代码;C#实现下载网页HTML源码的方法;C#获取HTML文本的第一张图片与截取内容摘要示例代码

标签: 方法

C++实现机票预订系统

C++11/14 线程调用类对象和线程传参的方法

上述就是C#学习教程:C#实现将HTML转换成纯文本的方法分享的全部内容,如果对大家有所用处且需要了解更多关于C#学习教程,希望大家多多关注—计算机技术网(www.ctvol.com)!

本文来自网络收集,不代表计算机技术网立场,如涉及侵权请联系管理员删除。

ctvol管理联系方式QQ:251552304

本文章地址:https://www.ctvol.com/cdevelopment/907314.html

(0)
上一篇 2021年10月24日
下一篇 2021年10月24日

精彩推荐