zoukankan      html  css  js  c++  java
  • 百度百科词条采集

     通过 https://baike.baidu.com/view/<数字id> 的方式尽可能遍历百科词条
     1 # -*- coding: utf-8 -*-
     2 # @time : 2019/7/1  14:56
     3 import requests
     4 import random
     5 from multiprocessing import Process, Pool
     6 import pymysql
     7 
     8 '''
     9 通过组装“https://baike.baidu.com/view/”+数字的方式进行多进程遍历。
    10 '''
    11 
    # MySQL connection settings. They were left blank in the published script and
    # must be filled in before running.
    mysql_ip = ''
    # The original line had no value at all (a syntax error); 3306 is MySQL's default port.
    mysql_port = 3306
    mysql_user = ''
    mysql_passwd = ''
    # NOTE: the misspelled name is kept because the pymysql.connect(...) call reads it as-is.
    msyql_db = ''

    # Size of the multiprocessing pool; the id range is split into this many slices.
    process_num = 5

    # A numeric id is appended to this prefix to form each page URL.
    baseUrl = 'https://baike.baidu.com/view/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        'Referer': 'https://www.baidu.com/',
        # 'br' is deliberately not advertised: requests can only decode brotli
        # when an optional brotli package is installed, otherwise the body
        # arrives still compressed and cannot be decoded as UTF-8.
        'Accept-Encoding': 'gzip, deflate',
    }
    # Candidate proxies in "host:port" form; one is picked at random per request.
    ip_pool = [
        '119.98.44.192:8118',
        '111.198.219.151:8118',
        '101.86.86.101:8118',
    ]
    31 
    # Module-level MySQL connection and cursor used by sprider().
    # charset is set explicitly so that the Chinese page content (including
    # 4-byte characters) is stored correctly regardless of the driver default.
    # NOTE(review): these objects are created at import time and are therefore
    # shared with forked pool workers - confirm that is safe for your platform.
    connection = pymysql.connect(host=mysql_ip, port=mysql_port, user=mysql_user, passwd=mysql_passwd, db=msyql_db,
                                 charset='utf8mb4')
    cursor = connection.cursor()
    # Append-mode log of ids whose fetch or insert raised; one id per line.
    filedWriter = open("filedItemUrl.txt", "a+", encoding="utf8")
    35 
    36 
    def ip_proxy():
        """Return a ``requests``-style proxies mapping for a randomly chosen pool entry.

        Returns:
            dict: ``{'http': <proxy url>, 'https': <proxy url>}``.
        """
        # random.choice follows the real pool size instead of a hard-coded 3.
        proxy_ip = 'https://' + random.choice(ip_pool)
        # requests looks the proxy up by the scheme of the *target* URL. The
        # crawler only requests https pages, so with just an 'http' key the
        # proxy was never applied.
        # NOTE(review): the 'https://' prefix describes how to talk to the proxy
        # itself; these pool entries may be plain-HTTP proxies - confirm.
        return {'http': proxy_ip, 'https': proxy_ip}
    42 
    43 
    def sprider(start_index, end_index):
        """Fetch ``baseUrl + <id>`` for each id in a range and store the pages in MySQL.

        Args:
            start_index: first id to fetch (inclusive).
            end_index: id at which to stop (exclusive).

        Pages whose final URL contains ``'error'`` are skipped. Any exception
        raised while fetching or inserting an id is caught: the id is appended
        to the failure log file and the loop moves on to the next id.
        """
        sql = 'insert into baikebaiku (id,url,html_content) values(%s,%s,%s)'
        for i in range(start_index, end_index):
            try:
                response = requests.get(baseUrl + str(i), proxies=ip_proxy(), headers=headers, timeout=1)
                if 'error' in response.url:
                    continue
                url = requests.utils.unquote(response.url)
                # Decode the raw bytes directly. The previous
                # text.encode('ISO-8859-1').decode('utf8') round-trip raised
                # UnicodeEncodeError when the response was not decoded as Latin-1.
                url_cotent = response.content.decode('utf8')
                cursor.execute(sql, (i, url, url_cotent))
                connection.commit()
                print("" + str(i) + "个,添加数据库成功")
            except Exception as e:
                # Best-effort crawl: record the failed id and keep going.
                filedWriter.write(str(i) + '\n')
                filedWriter.flush()
                print(e.args)
    62 
    63 
    if __name__ == '__main__':
        # Ids 0 .. total_ids - 1 are split into one contiguous slice per worker.
        total_ids = 20000000
        one_process_task_num = total_ids // process_num

        pool = Pool(processes=process_num)

        for i in range(process_num):
            start = one_process_task_num * i
            # The last worker also takes the remainder when total_ids is not
            # divisible by process_num, so no ids are silently dropped.
            end = total_ids if i == process_num - 1 else start + one_process_task_num
            # error_callback reports a worker that died with an exception;
            # without it apply_async discards the failure silently.
            pool.apply_async(sprider, args=[start, end], error_callback=print)

        pool.close()
        pool.join()

    以上方式采集到的词条可能有重复,并且遗漏了很多。

    因此我修改了采集方式:以 id 遍历为主、以页面中 a 标签的 href 链接为辅完成采集,并且此次采用 scrapy 实现。

    目录:

        

    import scrapy
    import requests
    import sys
    
    from scrapy import Request
    
    
    class BaikeSpider(scrapy.Spider):
        """Crawl Baidu Baike pages by numeric view id and by following ``/item/`` links.

        Each page that has not been saved yet is written to
        ``<name>_<unique_id>.html`` in the current working directory, and every
        ``/item/...`` link found on it is scheduled for crawling.
        """

        name = 'baike'
        allowed_domains = ['baike.baidu.com']

        # Numeric ids probed through /view/<id>: first is inclusive, last is exclusive.
        first_view_id = 1
        last_view_id = 25000000

        # Entry name -> set of ids already saved. This is a class attribute, so
        # it is shared by every instance of the spider.
        has_crawled_urls = dict()

        def start_requests(self):
            """Yield the seed requests lazily.

            Replaces a ``start_urls`` list that built roughly 25 million URL
            strings at class-definition time. ``dont_filter=True`` matches what
            Scrapy does for ``start_urls`` entries.
            """
            for view_id in range(self.first_view_id, self.last_view_id):
                yield Request('https://baike.baidu.com/view/' + str(view_id), dont_filter=True)

        def parse(self, response):
            """Save a not-yet-seen entry page and follow its ``/item/`` links.

            Args:
                response: the downloaded page.

            Yields:
                Request: one request per ``<a href>`` that starts with ``/item/``.
            """
            if 'error' in response.url:
                return

            url = requests.utils.unquote(response.url)

            # The last two path segments are taken as name and id. When the
            # second-to-last segment is the literal 'item', the URL has no id
            # segment, so the last segment is the name and the id becomes '-1'.
            url_split = url.split("?")[0].split("/")
            name = url_split[-2]
            unique_id = url_split[-1]
            if name == 'item':
                name = unique_id
                unique_id = str(-1)

            seen_ids = BaikeSpider.has_crawled_urls.setdefault(name, set())
            if unique_id in seen_ids:
                return

            # Store the current page first.
            url_cotent = str(response.body, encoding="utf-8")
            with open(str(name) + "_" + str(unique_id) + ".html", 'w', encoding="utf-8") as f:
                f.write(url_cotent)

            # Record the id for first-seen names too. Previously only an empty
            # set was created for them, so the same page could be saved again.
            seen_ids.add(unique_id)

            for href in response.xpath('//a/@href').extract():
                if href.startswith("/item/"):
                    yield Request("https://baike.baidu.com" + href)
    baike.py
    # Scrapy settings for baikeSprider project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    # Name of this Scrapy project.
    BOT_NAME = 'baikeSprider'
    
    # Package(s) searched for spider classes, and the package in which
    # `scrapy genspider` creates new spiders.
    SPIDER_MODULES = ['baikeSprider.spiders']
    NEWSPIDER_MODULE = 'baikeSprider.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # (left disabled here; a browser User-Agent is sent through
    # DEFAULT_REQUEST_HEADERS further down in this file instead)
    #USER_AGENT = 'baikeSprider (+http://www.yourdomain.com)'
    
    # robots.txt handling: set to False, so robots.txt rules are NOT obeyed by this crawl
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    DEFAULT_REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        'Referer': 'https://www.baidu.com/',
        'Accept-Encoding': 'gzip, deflate, br'
    }
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'baikeSprider.middlewares.BaikespriderSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'baikeSprider.middlewares.BaikespriderDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    #ITEM_PIPELINES = {
    #    'baikeSprider.pipelines.BaikespriderPipeline': 300,
    #}
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    settings.py

     

  • 相关阅读:
    解决KDE桌面附带文件索引框架Baloo占用资源过多问题
    [Journey with golang] 7. Traps
    [Journey with golang] 6. Reflection
    Codeforces Round #614 (Div. 2)
    [Journey with golang] 5. Concurrent
    2018-2019 9th BSUIR Open Programming Championship
    2019-2020 ACM-ICPC Pacific Northwest Regional Contest
    UFPE Starters Final Try-Outs 2020
    2019 ICPC Asia Taipei Hsinchu Regional Contest
    [Journey with golang] 4. Interface
  • 原文地址:https://www.cnblogs.com/dhName/p/11115696.html
Copyright © 2011-2022 走看看