C#---HTML 转文本及HTML内容提取
生活随笔
收集整理的這篇文章主要介紹了
C#---HTML 转文本及HTML内容提取
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
// 1. Converting HTML directly to plain text.
//
// Usage:
//     HtmlToText convert = new HtmlToText();
//     textBox2.Text = convert.Convert(textBox1.Text);
//
// 2. Extracting the body text of an HTML document (namespace HtmlStrip below).

using System;
using System.Collections.Generic;
using System.Text;
using System.Web;

/// <summary>
/// Converts HTML to plain text.
/// </summary>
class HtmlToText
{
    // Static data tables
    protected static Dictionary<string, string> _tags;
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        // Text emitted in place of each tag (keys are lower-case tag names,
        // closing tags carry a leading '/').
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        // Tags whose entire inner content is discarded.
        _ignoreTags = new HashSet<string> { "script", "noscript", "style", "object" };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted; null is treated as empty input</param>
    /// <returns>Resulting plain text (HTML entities decoded)</returns>
    public string Convert(string html)
    {
        if (html == null)
            return String.Empty;

        // Initialize state variables
        _text = new TextBuilder();
        _html = html;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.Write(value);

                if (_ignoreTags.Contains(tag))
                    EatInnerContent(tag);
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Return result
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the current tag and returns the
    /// lower-cased tag name (with a leading '/' for closing tags).
    /// An HTML comment is consumed as a whole and reported as "!--".
    /// </summary>
    /// <param name="selfClosing">Set to true when a '/' appears after the tag name</param>
    /// <returns>The tag name, or an empty string if not positioned on '&lt;'</returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            // Comments end at "-->", not at the first '>', and their content
            // must not be interpreted as attributes or quoted values.
            if (String.CompareOrdinal(_html, _pos, "<!--", 0, 4) == 0)
            {
                // Searching from the first '-' also accepts "<!-->" and "<!--->".
                int commentEnd = _html.IndexOf("-->", _pos + 2, StringComparison.Ordinal);
                _pos = (commentEnd < 0) ? _html.Length : commentEnd + 3;
                return "!--";
            }

            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
                MoveAhead();
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
                MoveAhead();

            // Invariant casing: culture-sensitive lower-casing breaks the
            // table lookups in some locales (e.g. 'I' under tr-TR).
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                    EatQuotedValue();
                else
                {
                    if (Peek() == '/')
                        selfClosing = true;
                    MoveAhead();
                }
            }
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the given tag, up to and including its
    /// matching end tag (or to the end of input when there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the opening tag just parsed</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        if (tag == "script" || tag == "style")
        {
            // Raw-text elements: the body is not markup, so a '<' inside it
            // must not be parsed as a tag. Look for the end tag textually.
            string marker = "<" + endTag;
            while (!EndOfText)
            {
                int close = _html.IndexOf(marker, _pos, StringComparison.OrdinalIgnoreCase);
                if (close < 0)
                {
                    _pos = _html.Length;
                    return;
                }
                _pos = close;
                bool ignored;
                if (ParseTag(out ignored) == endTag)
                    return;
            }
            return;
        }

        // Other ignored elements: only nested occurrences of the SAME tag
        // require an additional end tag.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                bool selfClosing;
                string inner = ParseTag(out selfClosing);
                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                        return;
                }
                else if (inner == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    // Returns true if the current position is at the end of the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ((char)0 when past the end)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
            MoveAhead();
    }

    // Moves the current position to the next non-whitespace character or
    // the start of the next line, whichever comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
                break;
        }
    }

    // Moves the current position past a quoted value. An unterminated
    // value ends at the end of the line.
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                MoveAhead(); // Closing quote
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to preformatted mode
                    if (_currLine.Length > 0)
                        FlushCurrLine();
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
                Write(c);
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
                return;
            }

            if (c == '\r')
            {
                // Ignore carriage returns. We'll process '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character (collapse runs)
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                    _currLine.Append(' ');
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        // Appends the current line to the output buffer, allowing at most
        // one consecutive blank line and none at the very start.
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            if (line.Length == 0)
            {
                // An empty line
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
                FlushCurrLine();
            return _text.ToString();
        }
    }
}

namespace HtmlStrip
{
    class MainClass
    {
        public static void Main(string[] args)
        {
            string str = "<div>abc</div><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo";
            // To read the input from a file instead:
            //     System.IO.StreamReader rd = new System.IO.StreamReader("/home/lx/test.html");
            //     str = rd.ReadToEnd();
            HtmlParser t = new HtmlParser(str);
            // To keep <br> tags in the output instead of filtering them:
            //     t.KeepTag(new string[] { "br" });
            Console.Write(t.Text());
        }
    }

    /// <summary>
    /// Strips tags, comments and script/style bodies from HTML, optionally
    /// keeping a configurable set of tags verbatim.
    /// </summary>
    class HtmlParser
    {
        // Elements whose body is raw text that must never reach the output.
        private static readonly string[] rawTextTags = new string[] { "script", "style" };

        private readonly string html;                                   // HTML being analysed
        private readonly StringBuilder result = new StringBuilder();    // accumulated output
        private int seek;                                               // current read position in html
        private string[] keepTag;                                       // tag names copied through verbatim
        private bool _inTag;                                            // whether seek is inside angle brackets
        private string tagName = "";                                    // name of the most recently entered tag

        /// <summary>
        /// Whether the read position is inside angle brackets. Setting this to
        /// true reads the tag name that starts at the current position; if the
        /// name is terminated directly by '&gt;' the flag drops back to false
        /// and the position is left on that '&gt;'.
        /// </summary>
        public bool inTag
        {
            get { return _inTag; }
            set
            {
                _inTag = value;
                if (!value)
                    return;

                tagName = ReadTagName();
                if (seek < html.Length && html[seek] == '>')
                    _inTag = false;
            }
        }

        /// <summary>
        /// Initializes the parser.
        /// </summary>
        /// <param name="html">HTML to analyse; null is treated as empty input</param>
        public HtmlParser(string html)
        {
            this.html = html ?? String.Empty;
            KeepTag(new string[] { });
        }

        /// <summary>
        /// Sets which tags are kept in the output instead of being filtered out.
        /// </summary>
        /// <param name="tags">Tag names without brackets (compared case-insensitively);
        /// null means keep nothing</param>
        public void KeepTag(string[] tags)
        {
            keepTag = tags ?? new string[] { };
        }

        /// <summary>
        /// Processes the remaining input and returns the extracted text.
        /// </summary>
        /// <returns>The text accumulated so far (repeated calls return the same result)</returns>
        public string Text()
        {
            while (seek < html.Length)
            {
                char current = html[seek];
                if (current != '<')
                {
                    // Plain text. A stray '>' outside of any tag is text too.
                    result.Append(current);
                    seek++;
                    continue;
                }

                if (String.CompareOrdinal(html, seek, "<!--", 0, 4) == 0)
                {
                    // A comment ends at "-->", regardless of any '>' inside it.
                    int commentEnd = html.IndexOf("-->", seek + 2, StringComparison.Ordinal);
                    int afterComment = (commentEnd < 0) ? html.Length : commentEnd + 3;
                    if (iskeepTag("!--"))
                        result.Append(html, seek, afterComment - seek);
                    seek = afterComment;
                    continue;
                }

                // Ordinary tag: read its name, then skip to the closing '>'
                // (or to the end of input when the tag is unterminated).
                int tagStart = seek;
                seek++;
                inTag = true;
                int close = html.IndexOf('>', seek);
                seek = (close < 0) ? html.Length : close + 1;
                _inTag = false;

                if (iskeepTag(tagName.Replace("/", "")))
                {
                    result.Append(html, tagStart, seek - tagStart);
                }
                else if (IsRawTextTag(tagName))
                {
                    // Jump straight to the matching end tag so script/style
                    // bodies containing '<' or '>' cannot leak into the output.
                    int end = html.IndexOf("</" + tagName, seek, StringComparison.OrdinalIgnoreCase);
                    seek = (end < 0) ? html.Length : end;
                }
            }
            return result.ToString();
        }

        /// <summary>
        /// Determines whether the given tag should be kept in the output.
        /// </summary>
        /// <param name="tag">Tag name without any '/'</param>
        /// <returns>True if the tag is in the keep list</returns>
        private bool iskeepTag(string tag)
        {
            foreach (string kept in keepTag)
            {
                if (String.Equals(tag, kept, StringComparison.OrdinalIgnoreCase))
                    return true;
            }
            return false;
        }

        // Returns true for opening script/style tags (case-insensitive).
        private static bool IsRawTextTag(string name)
        {
            foreach (string raw in rawTextTags)
            {
                if (String.Equals(name, raw, StringComparison.OrdinalIgnoreCase))
                    return true;
            }
            return false;
        }

        // Reads the tag name starting at seek: skips leading whitespace and
        // stops (without consuming) at whitespace, '>' or the end of input.
        private string ReadTagName()
        {
            while (seek < html.Length && Char.IsWhiteSpace(html[seek]))
                seek++;

            int start = seek;
            while (seek < html.Length && html[seek] != '>' && !Char.IsWhiteSpace(html[seek]))
                seek++;

            return html.Substring(start, seek - start);
        }
    }
}
===========該文轉自=========
http://blog.csdn.net/cjh200102/article/details/6824895#
================================
轉載于:https://www.cnblogs.com/zjw520/archive/2013/04/11/3014848.html
總結
以上是生活随笔為你收集整理的C#---HTML 转文本及HTML内容提取的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 成功创业者必须具备的9个特质
- 下一篇: Objective-C 文件夹操作