HTML解析類，讓你不使用正則也能輕鬆獲取HTML相關元素 -C# .NET

2019-11-17 02:24:24

字體：大中小

來源：轉載

供稿：網友

HTML解析類，讓你不使用正則也能輕鬆獲取HTML相關元素 -C# .NET

功能：

1、輕鬆獲取指定的HTML元素。

2、可以根據屬性標簽進行篩選

3、返回的都是List強類型，無需轉換

用過XElement的都知道用來解析xml非常的方便，但是對於格式多樣化的HTML實在是沒辦法兼容。

所以我就寫了這麼一個類似XElement的 XHtmlElement

用法：

            // Usage example for XHtmlElement (written for an ASP.NET page: Server, Response).
            // FileHelper and ChildDescendants are defined outside this snippet.
            string filePath = Server.MapPath("~/file/test.htm");

            // Read the HTML source from disk.
            string mailBody = FileHelper.FileToString(filePath);
            XHtmlElement xh = new XHtmlElement(mailBody);

            // <a> elements below <body> whose class attribute equals "icon".
            var link = xh.Descendants("body").ChildDescendants("a").Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon")).ToList();

            // <a> elements that carry an href attribute.
            var links = xh.Descendants("a").Where(c => c.Attributes.Any(a => a.Key == "href")).ToList();
            foreach (var r in links)
            {
                Response.Write(r.Attributes.Single(c => c.Key == "href").Value); // write out the href value
            }

            // The <img> elements found at the nearest level that contains any
            // (Descendants returns a list, not a single element).
            var img = xh.Descendants("img");

            // The nearest <p> element together with the other <p> elements on the same level.
            var ps = xh.Descendants("p");

代碼：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;

namespace SyntacticSugar
{
    /// <summary>
    /// ** Description: regex-based HTML parsing helper with an XElement-like API
    /// ** Created: 2015-4-23
    /// ** Modified: -
    /// ** Author: sunkaixuan
    /// ** QQ: 610262374 (feedback and suggestions on naming/syntax are welcome)
    /// </summary>
    public class XHtmlElement
    {
        /// <summary>
        /// Matches the name of the first opening tag in a fragment. Shared by every place that
        /// needs a tag name so that h1-h6 are recognised consistently.
        /// </summary>
        private const string TagNamePattern = @"(?<=\<\s{0,5}|\<)([a-z,A-Z]+|h\d{1})(?=\>|\s)";

        private string _html;

        /// <summary>
        /// Creates a parser over the given HTML text. The text is not validated here;
        /// <see cref="Descendants"/> throws when it is null.
        /// </summary>
        /// <param name="html">HTML source to parse.</param>
        public XHtmlElement(string html)
        {
            _html = html;
        }

        /// <summary>
        /// Returns the elements named <paramref name="elementName"/> from the nearest level that
        /// contains any: the root level is checked first, then nested levels are searched
        /// depth-first and the first level with a match is returned.
        /// </summary>
        /// <param name="elementName">Tag name to look for (case-insensitive); null matches every element.</param>
        /// <returns>The matching elements; an empty list when nothing matches (never null).</returns>
        /// <exception cref="ArgumentNullException">The HTML passed to the constructor was null.</exception>
        /// <exception cref="FormatException">The HTML could not be split into elements.</exception>
        public List<HtmlInfo> Descendants(string elementName = null)
        {
            if (_html == null)
            {
                // Two-argument overload: the single-argument one treats its argument as the parameter name.
                throw new ArgumentNullException("html", "html不能這空！");
            }

            var allList = RootDescendants(_html);
            var reval = allList.Where(c => IsTagMatch(c, elementName)).ToList();
            if (reval.Count == 0)
            {
                reval = GetDescendantsSource(allList, elementName);
            }
            return reval;
        }

        /// <summary>
        /// Splits a fragment into its first-level elements.
        /// </summary>
        /// <param name="html">Fragment to split; null means the HTML passed to the constructor.</param>
        /// <returns>One <c>HtmlInfo</c> per first-level element, in document order.</returns>
        /// <exception cref="FormatException">The fragment could not be split into elements.</exception>
        public List<HtmlInfo> RootDescendants(string html = null)
        {
            /*
             * Approach:
             * 1. Take the first tag and scan for its closing tag; every nested opening tag of the
             *    same name requires one more closing tag before the element is complete.
             * 2. Remove that element from the text and repeat for the 2nd .. Nth element.
             */
            if (html == null) html = _html;

            List<string> eleList = new List<string>();
            List<HtmlInfo> reval = new List<HtmlInfo>();
            GetElementsStringList(html, ref eleList);

            foreach (var r in eleList)
            {
                HtmlInfo data = new HtmlInfo();
                data.OldFullHtml = r;
                data.SameLeveHtml = html;
                data.TagName = Regex.Match(r, TagNamePattern, RegexOptions.IgnoreCase).Value;
                // Everything between the first '>' and the last '<' of the element.
                data.InnerHtml = Regex.Match(r, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;
                var eleBegin = Regex.Match(r, "<.+?>").Value;
                data.Attributes = ParseAttributes(eleBegin);
                reval.Add(data);
            }
            return reval;
        }

        #region private
        /// <summary>
        /// True when <paramref name="elementName"/> is null or equals the element's tag name,
        /// ignoring case (ordinal, so the result does not depend on the current culture).
        /// </summary>
        private static bool IsTagMatch(HtmlInfo info, string elementName)
        {
            return elementName == null
                || string.Equals(info.TagName, elementName, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Extracts the name="value" pairs of an opening tag.
        /// </summary>
        /// <param name="beginTag">Text of the opening tag, e.g. the part up to the first '>'.</param>
        /// <returns>Attribute name to value; when a name is repeated the first occurrence is kept.</returns>
        private static Dictionary<string, string> ParseAttributes(string beginTag)
        {
            var attributes = new Dictionary<string, string>();
            foreach (Match m in Regex.Matches(beginTag, @"[a-z,A-Z]+\="".+?"""))
            {
                // Split on the FIRST '=' only, so values that contain '=' (URLs with a query
                // string, for instance) are kept whole.
                int separator = m.Value.IndexOf('=');
                string key = m.Value.Substring(0, separator);
                string value = m.Value.Substring(separator + 1).Trim('"');

                // Dictionary.Add would throw on a repeated attribute name.
                if (!attributes.ContainsKey(key))
                {
                    attributes.Add(key, value);
                }
            }
            return attributes;
        }

        /// <summary>
        /// Depth-first search below <paramref name="allList"/> for the first nested level that
        /// contains elements named <paramref name="elementName"/>.
        /// </summary>
        /// <returns>The matches of that level, or an empty list when there are none.</returns>
        private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
        {
            foreach (var r in allList)
            {
                if (r.InnerHtml == null || !r.InnerHtml.Contains("<")) continue;

                // Parse the children once and reuse the result for filtering and for recursion.
                var children = RootDescendants(r.InnerHtml);
                var childList = children.Where(c => IsTagMatch(c, elementName)).ToList();
                if (childList.Count == 0)
                {
                    childList = GetDescendantsSource(children, elementName);
                }
                if (childList.Count > 0)
                {
                    return childList;
                }
            }
            return new List<HtmlInfo>();
        }

        /// <summary>
        /// Appends the text of every first-level element of <paramref name="html"/> to
        /// <paramref name="eleList"/>, in document order.
        /// </summary>
        /// <exception cref="FormatException">An element could not be extracted from the remaining text.</exception>
        private void GetElementsStringList(string html, ref List<string> eleList)
        {
            // Iterative: each pass extracts one element and removes it from the remaining text.
            while (true)
            {
                string tagName = Regex.Match(html, TagNamePattern, RegexOptions.IgnoreCase).Value;
                if (string.IsNullOrEmpty(tagName)) return;

                // NOTE(review): these patterns do not require a word boundary after the tag name,
                // so a short name such as "a" also matches "<abbr ...>"; left as in the original.
                string currentTagBeginReg = @"<\s{0,10}" + tagName + @".*?>"; // opening tag of the current element
                string currentTagEndReg = @"\<\/" + tagName + @"\>";          // closing tag of the current element
                string singleTagReg = @"<\s{0,10}" + tagName + "[^<].*?/>";   // self-closing form

                // Case 1: <a />           self-closing
                // Case 2: <a></a>         opening + closing tag
                // Case 3: <a>             malformed, no closing tag
                // Case 4: <!--[if ...]> ... [endif]-->   conditional comment
                string eleHtml;
                Match singleTag = Regex.Match(html, singleTagReg);
                if (singleTag.Success)
                {
                    eleHtml = singleTag.Value;
                }
                else if (!Regex.IsMatch(html, currentTagEndReg)) // no closing tag anywhere
                {
                    if (Regex.IsMatch(html, @"\s{0,10}\<\!\-\-\[if"))
                    {
                        eleHtml = GetElementString(html, @"\s{0,10}\<\!\-\-\[if", @"\[endif\]\-\-\>", 1);
                    }
                    else
                    {
                        eleHtml = Regex.Match(html, currentTagBeginReg, RegexOptions.Singleline).Value;
                    }
                }
                else
                {
                    eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
                }

                if (string.IsNullOrEmpty(eleHtml))
                {
                    // Nothing could be extracted, so the text would never shrink.
                    throw new FormatException("SORRY,您的HTML格式不能解析！！！");
                }

                eleList.Add(eleHtml);

                // Remove only the occurrence that was just extracted; replacing every occurrence
                // would silently drop identical sibling elements.
                int index = html.IndexOf(eleHtml, StringComparison.Ordinal);
                html = html.Remove(index, eleHtml.Length);
                html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "");

                if (Regex.IsMatch(html, @"^\s*$")) return;
            }
        }

        /// <summary>
        /// Returns the text of the first element: starting at the first opening tag, the span is
        /// extended one closing tag at a time (beginning with <paramref name="i"/> closing tags)
        /// until it contains as many opening as closing tags.
        /// </summary>
        /// <returns>The balanced element text, or an empty string when no balanced span exists.</returns>
        private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
        {
            while (true)
            {
                string newHtml = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, i);
                int beginCount = Regex.Matches(newHtml, currentTagBeginReg, RegexOptions.Singleline).Count;
                int endCount = Regex.Matches(newHtml, currentTagEndReg).Count;
                if (beginCount == endCount)
                {
                    // Opening and closing tags are balanced (also true for the empty "no match" result).
                    return newHtml;
                }
                i++;
            }
        }

        /// <summary>
        /// Matches from the first opening tag up to and including the
        /// <paramref name="i"/>-th closing tag that follows it.
        /// </summary>
        /// <returns>The matched text, or an empty string when there are fewer than <paramref name="i"/> closing tags.</returns>
        private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
        {
            return Regex.Match(val, currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}?", RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;
        }
        #endregion
    }

    public static class XHtmlElementExtendsion
    {
        /// <summa

上一篇：.NET C# 使用S22.Imap.dll接收郵件，並且收取指定文件夾的未讀郵件，並且更改未讀狀態

下一篇：C#string類型總結