Traversing as many Baidu Baike entries as possible by requesting https://baike.baidu.com/view/ + id
# -*- coding: utf-8 -*-
# @time : 2019/7/1 14:56
import requests
import random
from multiprocessing import Pool
import pymysql

'''
Traverse entries with multiple processes by assembling "https://baike.baidu.com/view/" + number.
'''

mysql_ip = ''
mysql_port = 3306  # 3306 is the MySQL default; adjust to your own server
mysql_user = ''
mysql_passwd = ''
mysql_db = ''

process_num = 5

baseUrl = 'https://baike.baidu.com/view/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'Referer': 'https://www.baidu.com/',
    'Accept-Encoding': 'gzip, deflate, br'
}
ip_pool = [
    '119.98.44.192:8118',
    '111.198.219.151:8118',
    '101.86.86.101:8118',
]


def ip_proxy():
    # Pick a random proxy from the pool and use it for both http and https requests.
    ip = random.choice(ip_pool)
    return {'http': 'http://' + ip, 'https': 'http://' + ip}


def sprider(start_index, end_index):
    # Each worker process opens its own MySQL connection and failure log;
    # sharing a single connection object across processes is not safe.
    connection = pymysql.connect(host=mysql_ip, port=mysql_port, user=mysql_user,
                                 passwd=mysql_passwd, db=mysql_db, charset='utf8mb4')
    cursor = connection.cursor()
    failed_writer = open("filedItemUrl.txt", "a+", encoding="utf8")

    for i in range(start_index, end_index):
        try:
            response = requests.get(baseUrl + str(i), proxies=ip_proxy(), headers=headers, timeout=1)
            if 'error' in response.url:
                # Non-existent ids redirect to an error page; skip them.
                pass
            else:
                url = requests.utils.unquote(response.url)
                url_content = response.content.decode('utf-8')  # decode the raw bytes directly as UTF-8
                sql = 'insert into baikebaiku (id,url,html_content) values(%s,%s,%s)'
                cursor.execute(sql, (i, url, url_content))
                connection.commit()
                print("No. " + str(i) + " saved to the database")
        except Exception as e:
            # Record the failed id so it can be re-crawled later.
            failed_writer.write(str(i) + ' ')
            failed_writer.flush()
            print(e.args)


if __name__ == '__main__':

    pool = Pool(processes=process_num)

    one_process_task_num = 20000000 // process_num

    for i in range(process_num):
        pool.apply_async(sprider, args=[one_process_task_num * i, one_process_task_num * (i + 1)])

    pool.close()
    pool.join()
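The INSERT above assumes a table named baikebaiku with columns (id, url, html_content) already exists; its exact schema isn't shown here, so the following is only a sketch of a compatible table, with column types that are a guess rather than the real definition.

# Sketch only: create a table compatible with the INSERT statement above.
# Column types and sizes are assumptions, not the original schema.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS baikebaiku (
    id           INT PRIMARY KEY,   -- the /view/<id> number
    url          VARCHAR(512),      -- decoded final URL of the entry
    html_content LONGTEXT           -- full page HTML
) DEFAULT CHARSET = utf8mb4
"""

# Fill in the same connection settings used by the crawler above.
connection = pymysql.connect(host='', port=3306, user='', passwd='', db='')
with connection.cursor() as cursor:
    cursor.execute(DDL)
connection.commit()
connection.close()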
The data collected this way may contain duplicates, and a large number of entries are missed.
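Ids that failed with a timeout or proxy error end up in filedItemUrl.txt as space-separated numbers, so the simplest remedy for the holes caused by errors is a re-crawl pass over that file. A rough sketch, assuming the file format written above and that sprider() from the first script is in scope:

# Sketch: re-crawl the ids that failed in the first pass.
def read_failed_ids(path="filedItemUrl.txt"):
    with open(path, encoding="utf8") as f:
        return sorted({int(tok) for tok in f.read().split() if tok.isdigit()})

if __name__ == '__main__':
    for i in read_failed_ids():
        # Re-run the same per-id logic; sprider() treats [i, i + 1) as a one-id range.
        sprider(i, i + 1)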
To address this more thoroughly, I changed the collection strategy: traverse primarily by id, supplemented by the a.href links found on each crawled page. This round of collection is implemented with Scrapy.
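In the Scrapy spider below, the deduplication key is the pair (entry name, numeric id) parsed from the percent-decoded URL: disambiguated entries share a name but differ in the numeric id, and a bare /item/<name> URL gets a sentinel id of -1. A small standalone illustration of that parsing (the example URLs are made up):

from urllib.parse import unquote

def parse_entry_key(url):
    """Split a Baike entry URL into (name, unique_id), mirroring the spider's logic."""
    parts = unquote(url).split("?")[0].split("/")
    name, unique_id = parts[-2], parts[-1]
    if name == 'item':               # URL of the form /item/<name>, no numeric id
        name, unique_id = unique_id, str(-1)
    return name, unique_id

# Hypothetical example URLs, for illustration only.
print(parse_entry_key("https://baike.baidu.com/item/%E8%8B%B9%E6%9E%9C/12345"))  # ('苹果', '12345')
print(parse_entry_key("https://baike.baidu.com/item/%E8%8B%B9%E6%9E%9C"))        # ('苹果', '-1')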
Project layout: the Scrapy spider and the project's settings.py are shown below.
import scrapy
import requests
from scrapy import Request


class BaikeSpider(scrapy.Spider):
    name = 'baike'
    allowed_domains = ['baike.baidu.com']

    # Maps entry name -> set of numeric ids already saved, so that ambiguous
    # entries (same name, different ids) are each stored exactly once.
    has_crawled_urls = dict()

    def start_requests(self):
        # Seed the crawl with the old /view/<id> URLs. Generating them lazily
        # here avoids materialising a 25-million-element start_urls list.
        for view_id in range(1, 25000000):
            yield Request('https://baike.baidu.com/view/' + str(view_id))

    def parse(self, response):
        # Non-existent ids redirect to an error page; skip those.
        if 'error' in response.url:
            return

        url = requests.utils.unquote(response.url)
        url_split = url.split("?")[0].split("/")
        name = url_split[-2]
        unique_id = url_split[-1]
        if name == 'item':
            # URL of the form /item/<name> with no numeric id.
            name = unique_id
            unique_id = str(-1)

        if name not in BaikeSpider.has_crawled_urls:
            BaikeSpider.has_crawled_urls[name] = set()
        elif unique_id in BaikeSpider.has_crawled_urls[name]:
            # This ambiguous entry has already been saved; skip it.
            return

        # Store the current page first.
        url_content = str(response.body, encoding="utf-8")
        with open(str(name) + "_" + str(unique_id) + ".html", 'w', encoding="utf-8") as f:
            f.write(url_content)
        BaikeSpider.has_crawled_urls[name].add(unique_id)

        # Then follow every in-page /item/ link as the supplementary source.
        for href in response.xpath('//a/@href').extract():
            if href and href.startswith("/item/"):
                yield Request("https://baike.baidu.com" + href)
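With the spider placed under baikeSprider/spiders/, the crawl is started from the project root with scrapy crawl baike (matching the spider's name attribute); each entry is saved as <name>_<unique_id>.html in the working directory, and has_crawled_urls keeps an ambiguous entry from being written twice.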
# Scrapy settings for baikeSprider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'baikeSprider'

SPIDER_MODULES = ['baikeSprider.spiders']
NEWSPIDER_MODULE = 'baikeSprider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'baikeSprider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'Referer': 'https://www.baidu.com/',
    'Accept-Encoding': 'gzip, deflate, br'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'baikeSprider.middlewares.BaikespriderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'baikeSprider.middlewares.BaikespriderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'baikeSprider.pipelines.BaikespriderPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
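The first script rotated requests through a small ip_pool; the same idea can be wired into Scrapy via the DOWNLOADER_MIDDLEWARES hook that is commented out above. A minimal sketch, not part of this project as posted; the class name, module path, and proxy list are placeholders:

# baikeSprider/middlewares.py (sketch): attach a random proxy to every request.
import random


class RandomProxyMiddleware(object):
    # Placeholder pool; replace with proxies that actually work.
    ip_pool = [
        '119.98.44.192:8118',
        '111.198.219.151:8118',
        '101.86.86.101:8118',
    ]

    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'].
        request.meta['proxy'] = 'http://' + random.choice(self.ip_pool)


# And in settings.py:
# DOWNLOADER_MIDDLEWARES = {
#     'baikeSprider.middlewares.RandomProxyMiddleware': 543,
# }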