zoukankan      html  css  js  c++  java
  • Python 单线程爬虫代码

    广度优先算法:

    # -*- coding: utf-8 -*- 
    import urllib
    import urllib.request
    from bs4 import BeautifulSoup
    import threading
    mylock = threading.RLock()
    class Crawler:
        unVisitUrl = set()
        visitedUrl = []
        def getHtml(self , url):
    
            html = ''
            req = urllib.request.Request(url , headers = {
                'Connection': 'Keep-Alive',
                'Accept': 'text/html, application/xhtml+xml, */*',
                'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
            })
            try:
                respose = urllib.request.urlopen(req , timeout = 10)
                html = respose.read().decode('UTF-8').replace('&nbsp','')
            except Exception as e:
                pass
    
            return html;
    
        def getUrlFromHtml(self , html , sitePath):
    
            if(html):
                soup = BeautifulSoup(html , 'html.parser')
                aList = soup.find_all('a')
                for a in aList:
                    try:
                        if sitePath in a['href'] and a['href'].startswith('http://'):
                            self.addUnVisitUrl(a['href'])
                            self.addVisitedUrl(a['href'])
                    except KeyError:
                        pass
    
        # 解析网页内容
        def analysis(self , url , sitePath):
    
            self.initUnVisitUrl(url)
    
            while(len(self.unVisitUrl) > 0):
                visitingUrl = self.getUnVisitUrl()
                print(visitingUrl)
                if(visitingUrl):
                    html = self.getHtml(visitingUrl)
                    if(html):
                        # 获取网页中所有内部链接,存储
                        self.getUrlFromHtml(html , sitePath)
    
        # 初始化根链接
        def initUnVisitUrl(self , url):
            self.unVisitUrl.add(url)
    
        def addUnVisitUrl(self , url):
            if url not in self.unVisitUrl and url not in self.visitedUrl:
                self.unVisitUrl.add(url)
    
        def getUnVisitUrl(self):
    
            url = None
            unVisitUrlTmp = list(self.unVisitUrl)
            if unVisitUrlTmp[0]:
                url = unVisitUrlTmp[0]
                self.unVisitUrl.remove(url)
    
            return url
    
        def addVisitedUrl(self , url):
            self.visitedUrl.append(url)
  • 相关阅读:
    第一次博客作业
    第一次软工作业
    第一次博客作业
    MySQL查看当前用户
    JSP官方文档(JavaServer Pages Specification)下载
    第一次博客作业
    第一次个人编程作业
    第一次博客作业
    第一次个人编程作业
    第一次博客作业
  • 原文地址:https://www.cnblogs.com/orlion/p/4895449.html
Copyright © 2011-2022 走看看