using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using NSoup.Nodes;
using NSoup.Select;

class Crawler_Method
{
    // Fetch the city map page and parse it into a dictionary of cities.
    public static Dictionary<String, String> GETCity()
    {
        String html = GET("https://www.zhaopin.com/citymap.html"); // issue the HTTP request
        return Parse1(html);                                       // parse the returned page into data
    }

    // Parse the city map page: map each city name to its page URL.
    public static Dictionary<String, String> Parse1(String html)
    {
        Dictionary<String, String> map = new Dictionary<string, string>();
        Document doc = NSoup.NSoupClient.Parse(html);        // build an NSoup Document from the raw HTML
        Elements elements = doc.GetElementsByClass("col1");  // elements whose class attribute is "col1"
        Elements e = elements[0].Select("a");                 // all <a> tags inside the first match
        for (int x = 0; x < e.Count; x++)                    // loop over the links and collect the data
        {
            Element a = e[x];
            if (a.Attr("href").Substring(2).StartsWith("www")) // keep only links of the form "//www..."
                map.Add(a.Text(), "https://" + a.Attr("href").Substring(2));
        }
        return map;
    }

    // Fetch a city page and parse the list of companies on it.
    public static Dictionary<String, String> GetCompany(String url)
    {
        String html = GET(url);
        return Parse2(html);
    }

    // Parse a city page; the structure mirrors Parse1, mapping company names to their URLs.
    public static Dictionary<String, String> Parse2(String html)
    {
        Dictionary<String, String> map = new Dictionary<string, string>();
        Document doc = NSoup.NSoupClient.Parse(html);
        Elements elements = doc.GetElementsByClass("nctt");
        Elements e = elements[0].Select("li");
        for (int x = 0; x < e.Count; x++)
        {
            Element a = e[x];
            map.Add(a.Select("a")[0].Text(), a.Select("a")[0].Attr("href"));
        }
        return map;
    }

    // Send an HTTP GET request and return the page source as a string.
    public static String GET(String share_url)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(share_url); // build the request from the URL
        request.Method = "GET";                                                // use the GET method
        request.AllowAutoRedirect = true;                                      // follow redirects
        // Set a browser User-Agent so the site's anti-crawler checks do not block the request.
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0";
        HttpWebResponse response = (HttpWebResponse)request.GetResponse();     // get the response object
        Stream stream = response.GetResponseStream();                          // the response body stream
        StreamReader read = new StreamReader(stream, System.Text.Encoding.UTF8); // decode the body as UTF-8
        String nextline;
        String html = "";
        while ((nextline = read.ReadLine()) != null)                           // read the page source line by line
        {
            html += nextline;
        }
        read.Close();
        return html;                                                           // return the page source
    }
}
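// A minimal usage sketch, not part of the original listing: it assumes the class above is
// compiled into a console project and that zhaopin.com still serves the same page structure.
// It prints the city list, then the companies found on the first city's page.
class Program
{
    static void Main()
    {
        // City name -> city page URL
        Dictionary<String, String> cities = Crawler_Method.GETCity();
        foreach (KeyValuePair<String, String> city in cities)
        {
            Console.WriteLine(city.Key + " -> " + city.Value);
        }

        // Company name -> company page URL, for one city only to keep the example short
        foreach (KeyValuePair<String, String> city in cities)
        {
            Dictionary<String, String> companies = Crawler_Method.GetCompany(city.Value);
            foreach (KeyValuePair<String, String> company in companies)
            {
                Console.WriteLine(company.Key + " -> " + company.Value);
            }
            break;
        }
    }
}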