zoukankan      html  css  js  c++  java
  • mini爬虫程序

    Code
    /// <summary>
    /// A minimal interactive web crawler: downloads a page, shows the absolute
    /// links it contains one at a time, and lets the user follow one of them.
    /// </summary>
    class MiniCrawler
    {
        /// <summary>
        /// Finds the next <c>href="http...</c> attribute value in a page.
        /// </summary>
        /// <param name="htmlstr">The page content to search.</param>
        /// <param name="startloc">
        /// Index at which the search starts. When a link is found it is advanced
        /// to the index of the link's closing quote, so the next call continues
        /// after this link. It is left untouched when no link is found.
        /// </param>
        /// <returns>
        /// The link text between the quotes, or <c>null</c> when no further
        /// complete link exists at or after <paramref name="startloc"/>.
        /// </returns>
        static string FindLink(string htmlstr, ref int startloc)
        {
            if (string.IsNullOrEmpty(htmlstr) || startloc < 0 || startloc >= htmlstr.Length)
            {
                return null;
            }

            // Case-insensitive ordinal search directly on the original string:
            // avoids the culture-sensitive ToLower() copy of the whole page.
            int attribute = htmlstr.IndexOf("href=\"http", startloc, StringComparison.OrdinalIgnoreCase);
            if (attribute == -1)
            {
                return null;
            }

            int start = htmlstr.IndexOf('"', attribute) + 1;
            int end = htmlstr.IndexOf('"', start);
            if (end == -1)
            {
                // Unterminated attribute value (e.g. truncated page): there is no
                // complete link left, and Substring would throw on a negative length.
                return null;
            }

            startloc = end;
            return htmlstr.Substring(start, end - start);
        }

        /// <summary>
        /// Interactively crawls starting at <paramref name="uristr"/>. For every
        /// link found on the current page the user is asked "Link, More, Quit?":
        /// L follows the link, M shows the next link, Q stops crawling.
        /// </summary>
        /// <param name="uristr">The absolute URI of the first page to download.</param>
        /// <remarks>
        /// Network, protocol, URI format, unsupported scheme and I/O errors are
        /// reported on the console and end the crawl; they are not rethrown.
        /// </remarks>
        public static void Crawle(string uristr)
        {
            try
            {
                do
                {
                    Console.WriteLine("Linking to " + uristr);

                    // Create a WebRequest for the given URI.
                    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(uristr);

                    // Forget this URI. Only an explicit "L" answer sets the next
                    // one, so Quit / "No link found." actually end the outer loop
                    // instead of downloading the same page again forever.
                    uristr = null;

                    string str;

                    // Send the request and read the whole page. The using blocks
                    // release the response, stream and reader even on failure.
                    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
                    using (Stream istrm = resp.GetResponseStream())
                    using (StreamReader rdr = new StreamReader(istrm))
                    {
                        str = rdr.ReadToEnd();
                    }

                    int curloc = 0; // holds current location in response

                    while (true)
                    {
                        // Look for the next URI in the page.
                        string link = FindLink(str, ref curloc);
                        if (link == null)
                        {
                            Console.WriteLine("No link found.");
                            break;
                        }

                        Console.WriteLine("发现链接: " + link);
                        Console.Write("Link, More, Quit?");
                        string answer = Console.ReadLine();

                        // A null answer means the input stream ended; treat it as
                        // Quit rather than silently walking through every link.
                        if (answer == null || string.Equals(answer, "Q", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }

                        if (string.Equals(answer, "L", StringComparison.OrdinalIgnoreCase))
                        {
                            uristr = link;
                            break;
                        }

                        if (string.Equals(answer, "M", StringComparison.OrdinalIgnoreCase))
                        {
                            Console.WriteLine("Searching for another link.");
                        }

                        // Any other answer falls through and shows the next link.
                    }
                }
                while (uristr != null);
            }
            catch (WebException exc)
            {
                Console.WriteLine("Network Error: " + exc.Message +
                                  "\nStatus code: " + exc.Status);
            }
            catch (ProtocolViolationException exc)
            {
                Console.WriteLine("Protocol Error: " + exc.Message);
            }
            catch (UriFormatException exc)
            {
                Console.WriteLine("URI Format Error: " + exc.Message);
            }
            catch (NotSupportedException exc)
            {
                Console.WriteLine("Unknown Protocol: " + exc.Message);
            }
            catch (IOException exc)
            {
                Console.WriteLine("I/O Error: " + exc.Message);
            }

            Console.WriteLine("Terminating MiniCrawler.");
        }
    }
  • 相关阅读:
    钉钉outgoing机器人小项目开发
    js根据cookie判断,一天之内只弹出一次弹窗
    js倒计时功能
    jquery的$().each,$.each的区别
    VS代码提示自动高亮
    winform当前屏幕大小
    动态增删改控件
    datagridveiw样式
    sql 语句 提取中文的首字母
    按键监听及重写
  • 原文地址:https://www.cnblogs.com/nuaalfm/p/1410354.html
Copyright © 2011-2022 走看看