皇家社会vs莱万特
聊城萬拓網絡科技-專業聊城網站建設、聊城網站制作、聊城網站優化、聊城做網站的品牌網站建設專家!

    您當前的位置是:首頁 - 新聞動態 - 網站建設 » 一段C#抓取百度、谷歌、搜狗、360等搜索引擎結果的代碼

    一段C#抓取百度、谷歌、搜狗、360等搜索引擎結果的代碼
     發布時間:2014-03-26  點擊次數: 次   作者:萬拓網絡  來源:lcbaituo.com  Tags:

    最近做了個項目,需要抓取百度、谷歌、搜狗、360等搜索引擎的搜索結果,把搜索到的標題和鏈接一一提取出來。其實頁面本身是很好獲取的,主要的問題在于如何用正則表達式處理下載下來的頁面。于是在論壇上請教了大家,在大家的幫助下,這個功能的核心代碼已經完成,現在整理出來,供有需要的人參考。

    C# 代碼:

    using System;
    using System.Collections.Generic;
    using System.ComponentModel;
    using System.Data;
    using System.Drawing;
    using System.Linq;
    using System.Text;
    using System.Windows.Forms;
    using System.Net;
    using System.IO;
    using httpState;
    using System.Text.RegularExpressions;
    using System.Collections;

    namespace test
    {
        public partial class DownLoadTest : Form
        {
           
            /// <summary>
            /// Creates the form and calls <c>InitializeComponent</c>, which is defined
            /// outside the visible part of this partial class.
            /// </summary>
            public DownLoadTest()
            {
                InitializeComponent();
            }
            /// <summary>
            /// 百度搜索
            /// </summary>
            /// <param name="sender"></param>
            /// <param name="e"></param>
            private void btnBaidu_Click(object sender, EventArgs e)
            {       
                int num = 20;//搜索條數
                string url = "http://www.baidu.com/s?wd=" + txtSearch.Text.Trim() + "&rn=" + num + "";
                string html=search(url,"gb2312");
                BaiduSearch baidu = new BaiduSearch();
                if (!string.IsNullOrEmpty(html))
                {
                    int count = baidu.GetSearchCount(html);//搜索條數
                    if (count > 0)
                    {
                        List<Keyword> keywords = baidu.GetKeywords(html, txtSearch.Text.Trim());
                        dataGridView1.DataSource = keywords;
                    }
                  
                }
            }
            /// <summary>
            /// 谷歌搜索
            /// </summary>
            /// <param name="sender"></param>
            /// <param name="e"></param>
            private void button2_Click(object sender, EventArgs e)
            {
                int num=100;
                string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + txtSearch.Text.Trim() + "&aq=f&aqi=&aql=&oq=&num="+num+"";
                string html=search(url,"utf-8");
                if (!string.IsNullOrEmpty(html))
                {

                    googleSearch google = new googleSearch();
                    List<Keyword> keywords = google.GetKeywords(html, txtSearch.Text.Trim());
                        dataGridView1.DataSource = keywords;
                   
                }
            }
            /// <summary>
            /// 搜索處理
            /// </summary>
            /// <param name="url">搜索網址</param>
            /// <param name="Chareset">編碼</param>
            public string search(string url,string Chareset)
            {
                HttpState result = new HttpState();
                Uri uri = new Uri(url);
                HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
                myHttpWebRequest.UseDefaultCredentials = true;
                myHttpWebRequest.ContentType = "text/html";
                myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
                myHttpWebRequest.Method = "GET";
                myHttpWebRequest.CookieContainer = new CookieContainer();

                try
                {
                    HttpWebResponse response = (HttpWebResponse)myHttpWebRequest.GetResponse();
                    // 從 ResponseStream 中讀取HTML源碼并格式化 add by cqp
                    result.Html = readResponseStream(response, Chareset);
                    result.CookieContainer = myHttpWebRequest.CookieContainer;
                     return result.Html;                
                }
                catch (Exception ex)
                {
                    return ex.ToString();
                }
               
            }
            public string readResponseStream(HttpWebResponse response, string Chareset)
            {
                string result = "";
                using (StreamReader responseReader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(Chareset)))
                {         
                    result = formatHTML(responseReader.ReadToEnd());
                }

                return result;
            }
            /// <summary>
            /// 描述:格式化網頁源碼
            ///
            /// </summary>
            /// <param name="htmlContent"></param>
            /// <returns></returns>
            public string formatHTML(string htmlContent)
            {
                string result = "";

                result = htmlContent.Replace("&raquo;", "").Replace("&nbsp;", "")
                        .Replace("&copy;", "").Replace("/r", "").Replace("/t", "")
                        .Replace("/n", "").Replace("&amp;", "&");

                return result;
            }

            class BaiduSearch
            {
                protected string uri = "http://www.baidu.com/s?wd=";
                //protected string uri = "http://www.baidu.com/s?wd=software&pn=10&usm=2"; // 第二頁
                protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
                protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");
                protected string resultPattern = @"(?<=找到相關結果[約]?)[0-9,]*?(?=個)";
                public int GetSearchCount(string html)
                {
                    int result = 0;
                    string searchcount = string.Empty;

                    Regex regex = new Regex(resultPattern);
                    Match match = regex.Match(html);

                    if (match.Success)
                    {
                        searchcount = match.Value;
                    }
                    else
                    {
                        searchcount = "0";
                    }

                    if (searchcount.IndexOf(",") > 0)
                    {
                        searchcount = searchcount.Replace(",", string.Empty);
                    }

                    int.TryParse(searchcount, out result);

                    return result;
                }

                public List<Keyword> GetKeywords(string html, string word)
                {
                    int i=1;
                    List<Keyword> keywords = new List<Keyword>();

                    Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{1,2}|100)/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                    //Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{2})/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                    Regex regA = new Regex(@"(?is)<a/b[^>]*?href=(['""]?)(?<link>[^'""/s>]+)/1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);

                    MatchCollection mcTable = regTable.Matches(html);
                    foreach (Match mTable in mcTable)
                    {
                        if (mTable.Success)
                        {
                            Match mA = regA.Match(mTable.Value);
                            if (mA.Success)
                            {

                                Keyword keyword = new Keyword();
                                keyword.ID=i++;
                                keyword.Link = mA.Groups["link"].Value;
                                keyword.Title = mA.Groups["title"].Value;
                                keywords.Add(keyword);
                            }
                        }
                    }

                    return keywords;
                }
            }
            class googleSearch
            {
                public List<Keyword> GetKeywords(string html, string word)
                {
                    int i = 1;
                    List<Keyword> keywords = new List<Keyword>();

                    Regex regTable = new Regex(@"(?is)<h3[^>]*?>(?><h3[^>]*>(?<o>)|</h3>(?<-o>)|(?:(?!</?h3/b).)*)*(?(o)(?!))</h3>", RegexOptions.IgnoreCase);
                    //Regex regTable = new Regex(@"(?is)<table[^>]*?id=(['""]?)(/d{2})/1[^>]*>(?><table[^>]*>(?<o>)|</table>(?<-o>)|(?:(?!</?table/b).)*)*(?(o)(?!))</table>", RegexOptions.IgnoreCase);
                    Regex regA = new Regex(@"(?is)<a/b[^>]*?href=(['""]?)(?<link>[^'""/s>]+)/1[^>]*>(?<title>.*?)</a>", RegexOptions.IgnoreCase);

                    MatchCollection mcTable = regTable.Matches(html);
                    foreach (Match mTable in mcTable)
                    {
                        if (mTable.Success)
                        {
                            Match mA = regA.Match(mTable.Value);
                            if (mA.Success)
                            {

                                Keyword keyword = new Keyword();
                                keyword.ID = i++;
                                keyword.Link = mA.Groups["link"].Value;
                                keyword.Title = mA.Groups["title"].Value;
                                keywords.Add(keyword);
                            }
                        }
                    }

                    return keywords;
                }
            }
            /// <summary>
            /// One extracted search result; lists of these are assigned to the grid's DataSource.
            /// </summary>
            class Keyword
            {
                /// <summary>Sequence number assigned by the GetKeywords methods, starting at 1.</summary>
                public int ID { get; set; }
                /// <summary>Inner content of the matched anchor element (the "title" regex group).</summary>
                public string Title { get; set; }
                /// <summary>Value of the matched anchor's href attribute (the "link" regex group).</summary>
                public string Link { get; set; }
            }
        }
    }
    HttpState:

    using System.Net;
    using System.Collections;

    namespace httpState
    {
        public class HttpState
        {

            // 獲取與響應一起返回的狀態說明。
            private string _statusDescription;

            public string StatusDescription
            {
                get { return _statusDescription; }
                set { _statusDescription = value; }
            }

            /// <summary>
            /// 回調 址址, 登陸測試中使用
            /// </summary>
            private string _callBackUrl;

            public string CallBackUrl
            {
                get { return _callBackUrl; }
                set { _callBackUrl = value; }
            }


            /// <summary>
            /// 網頁網址 絕對路徑格式
            /// </summary>
            private string _url;

            public string Url
            {
                get { return _url; }
                set { _url = value; }
            }

            /// <summary>
            /// 字符串的形式的Cookie信息
            /// </summary>
            private string _cookies;

            public string Cookies
            {
                get { return _cookies; }
                set { _cookies = value; }
            }

            /// <summary>
            /// Cookie信息
            /// </summary>
            private CookieContainer _cookieContainer = new CookieContainer();

            public CookieContainer CookieContainer
            {
                get { return _cookieContainer; }
                set { _cookieContainer = value; }
            }

            /// <summary>
            /// 網頁源碼
            /// </summary>
            private string _html;

            public string Html
            {
                get { return _html; }
                set { _html = value; }
            }

            /// <summary>
            /// 驗證碼臨時文件(絕對路徑)
            /// </summary>
            private string _tmpValCodePic;

            public string TmpValCodePic
            {
                get { return _tmpValCodePic; }
                set { _tmpValCodePic = value; }
            }

            /// <summary>
            /// 驗證碼臨時文件名(相對路徑)
            /// </summary>
            private string _tmpValCodeFileName = "emptyPic.gif";

            public string TmpValCodeFileName
            {
                get { return _tmpValCodeFileName; }
                set { _tmpValCodeFileName = value; }
            }

            /// <summary>
            /// 有驗證碼
            /// </summary>
            private bool _isValCode;

            public bool IsValCode
            {
                get { return _isValCode; }
                set { _isValCode = value; }
            }

            /// <summary>
            /// 驗證碼URL
            /// </summary>
            private string _valCodeURL;

            public string ValCodeURL
            {
                get { return _valCodeURL; }
                set { _valCodeURL = value; }
            }

            /// <summary>
            /// 驗證碼識別后的值
            /// </summary>
            private string _valCodeValue;

            public string ValCodeValue
            {
                get { return _valCodeValue; }
                set { _valCodeValue = value; }
            }

            /// <summary>
            /// 其它參數
            /// </summary>
            private Hashtable _otherParams = new Hashtable();

            public Hashtable OtherParams
            {
                get { return _otherParams; }
                set { _otherParams = value; }
            }

            // 重復添加處理 add by fengcj  09/11/19 PM
            public void addOtherParam(object key, object value)
            {
                if (!this.OtherParams.ContainsKey(key))
                    this.OtherParams.Add(key, value);
                else
                {
                    this.OtherParams[key] = value;
                }
            }

            public void removeOtherParam(object key)
            {
                this.OtherParams.Remove(key);
            }

            public object getOtherParam(object key)
            {
                return this.OtherParams[key];
            }
        }
    }

     界面很簡單:一個輸入框、兩個搜索按鈕和一個 DataGridView。



    分享到:
    上一篇:用HTML的方式實現IE瀏覽器的菜單命令集錦
    下一篇:如何在Windows 8上安裝配置IIS8.0的環境
     

    本站業務:聊城網站建設-聊城網站制作-聊城做網站  
    皇家社会vs莱万特 大乐透胆拖兑奖表 金英权 新会员送88彩金 快三如何选号技巧 天下彩票免费资枓大全 最火爆的棋牌游戏 平·特一肖 北京pk10一天多少期 老时时彩开奖号码 赌博押牛牛做庄的技巧 快速时时官网 大赢家比分直播