Traversing as many Baidu Baike entries as possible by requesting https://baike.baidu.com/view/ + id
# -*- coding: utf-8 -*-
# @time : 2019/7/1 14:56
import requests
import random
from multiprocessing import Pool
import pymysql

'''
Traverse entries with multiple processes by assembling "https://baike.baidu.com/view/" + number.
'''

mysql_ip = ''
mysql_port = 3306  # 3306 is the MySQL default; adjust to your own server
mysql_user = ''
mysql_passwd = ''
mysql_db = ''

process_num = 5

baseUrl = 'https://baike.baidu.com/view/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'Referer': 'https://www.baidu.com/',
    'Accept-Encoding': 'gzip, deflate, br'
}
ip_pool = [
    '119.98.44.192:8118',
    '111.198.219.151:8118',
    '101.86.86.101:8118',
]


def ip_proxy():
    # Pick a random proxy from the pool and use it for both http and https requests.
    ip = random.choice(ip_pool)
    return {'http': 'http://' + ip, 'https': 'http://' + ip}


def sprider(start_index, end_index):
    # Each worker process opens its own MySQL connection and failure log;
    # sharing a single connection object across processes is not safe.
    connection = pymysql.connect(host=mysql_ip, port=mysql_port, user=mysql_user,
                                 passwd=mysql_passwd, db=mysql_db, charset='utf8mb4')
    cursor = connection.cursor()
    failed_writer = open("filedItemUrl.txt", "a+", encoding="utf8")

    for i in range(start_index, end_index):
        try:
            response = requests.get(baseUrl + str(i), proxies=ip_proxy(), headers=headers, timeout=1)
            if 'error' in response.url:
                # Non-existent ids redirect to an error page; skip them.
                pass
            else:
                url = requests.utils.unquote(response.url)
                url_content = response.content.decode('utf-8')  # decode the raw bytes directly as UTF-8
                sql = 'insert into baikebaiku (id,url,html_content) values(%s,%s,%s)'
                cursor.execute(sql, (i, url, url_content))
                connection.commit()
                print("No. " + str(i) + " saved to the database")
        except Exception as e:
            # Record the failed id so it can be re-crawled later.
            failed_writer.write(str(i) + ' ')
            failed_writer.flush()
            print(e.args)


if __name__ == '__main__':

    pool = Pool(processes=process_num)

    one_process_task_num = 20000000 // process_num

    for i in range(process_num):
        pool.apply_async(sprider, args=[one_process_task_num * i, one_process_task_num * (i + 1)])

    pool.close()
    pool.join()
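The INSERT above assumes a table named baikebaiku with columns (id, url, html_content) already exists; its exact schema isn't shown here, so the following is only a sketch of a compatible table, with column types that are a guess rather than the real definition.

# Sketch only: create a table compatible with the INSERT statement above.
# Column types and sizes are assumptions, not the original schema.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS baikebaiku (
    id           INT PRIMARY KEY,   -- the /view/<id> number
    url          VARCHAR(512),      -- decoded final URL of the entry
    html_content LONGTEXT           -- full page HTML
) DEFAULT CHARSET = utf8mb4
"""

# Fill in the same connection settings used by the crawler above.
connection = pymysql.connect(host='', port=3306, user='', passwd='', db='')
with connection.cursor() as cursor:
    cursor.execute(DDL)
connection.commit()
connection.close()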
The data collected this way may contain duplicates, and a large number of entries are missed.
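Ids that failed with a timeout or proxy error end up in filedItemUrl.txt as space-separated numbers, so the simplest remedy for the holes caused by errors is a re-crawl pass over that file. A rough sketch, assuming the file format written above and that sprider() from the first script is in scope:

# Sketch: re-crawl the ids that failed in the first pass.
def read_failed_ids(path="filedItemUrl.txt"):
    with open(path, encoding="utf8") as f:
        return sorted({int(tok) for tok in f.read().split() if tok.isdigit()})

if __name__ == '__main__':
    for i in read_failed_ids():
        # Re-run the same per-id logic; sprider() treats [i, i + 1) as a one-id range.
        sprider(i, i + 1)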
To address this more thoroughly, I changed the collection strategy: traverse primarily by id, supplemented by the a.href links found on each crawled page. This round of collection is implemented with Scrapy.
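In the Scrapy spider below, the deduplication key is the pair (entry name, numeric id) parsed from the percent-decoded URL: disambiguated entries share a name but differ in the numeric id, and a bare /item/<name> URL gets a sentinel id of -1. A small standalone illustration of that parsing (the example URLs are made up):

from urllib.parse import unquote

def parse_entry_key(url):
    """Split a Baike entry URL into (name, unique_id), mirroring the spider's logic."""
    parts = unquote(url).split("?")[0].split("/")
    name, unique_id = parts[-2], parts[-1]
    if name == 'item':               # URL of the form /item/<name>, no numeric id
        name, unique_id = unique_id, str(-1)
    return name, unique_id

# Hypothetical example URLs, for illustration only.
print(parse_entry_key("https://baike.baidu.com/item/%E8%8B%B9%E6%9E%9C/12345"))  # ('苹果', '12345')
print(parse_entry_key("https://baike.baidu.com/item/%E8%8B%B9%E6%9E%9C"))        # ('苹果', '-1')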
Project layout: the Scrapy spider and the project's settings.py are shown below.
import scrapy
import requests
from scrapy import Request


class BaikeSpider(scrapy.Spider):
    name = 'baike'
    allowed_domains = ['baike.baidu.com']

    # Maps entry name -> set of numeric ids already saved, so that ambiguous
    # entries (same name, different ids) are each stored exactly once.
    has_crawled_urls = dict()

    def start_requests(self):
        # Seed the crawl with the old /view/<id> URLs. Generating them lazily
        # here avoids materialising a 25-million-element start_urls list.
        for view_id in range(1, 25000000):
            yield Request('https://baike.baidu.com/view/' + str(view_id))

    def parse(self, response):
        # Non-existent ids redirect to an error page; skip those.
        if 'error' in response.url:
            return

        url = requests.utils.unquote(response.url)
        url_split = url.split("?")[0].split("/")
        name = url_split[-2]
        unique_id = url_split[-1]
        if name == 'item':
            # URL of the form /item/<name> with no numeric id.
            name = unique_id
            unique_id = str(-1)

        if name not in BaikeSpider.has_crawled_urls:
            BaikeSpider.has_crawled_urls[name] = set()
        elif unique_id in BaikeSpider.has_crawled_urls[name]:
            # This ambiguous entry has already been saved; skip it.
            return

        # Store the current page first.
        url_content = str(response.body, encoding="utf-8")
        with open(str(name) + "_" + str(unique_id) + ".html", 'w', encoding="utf-8") as f:
            f.write(url_content)
        BaikeSpider.has_crawled_urls[name].add(unique_id)

        # Then follow every in-page /item/ link as the supplementary source.
        for href in response.xpath('//a/@href').extract():
            if href and href.startswith("/item/"):
                yield Request("https://baike.baidu.com" + href)
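With the spider placed under baikeSprider/spiders/, the crawl is started from the project root with scrapy crawl baike (matching the spider's name attribute); each entry is saved as <name>_<unique_id>.html in the working directory, and has_crawled_urls keeps an ambiguous entry from being written twice.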
# Scrapy settings for baikeSprider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'baikeSprider'

SPIDER_MODULES = ['baikeSprider.spiders']
NEWSPIDER_MODULE = 'baikeSprider.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'baikeSprider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'Referer': 'https://www.baidu.com/',
    'Accept-Encoding': 'gzip, deflate, br'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'baikeSprider.middlewares.BaikespriderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'baikeSprider.middlewares.BaikespriderDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'baikeSprider.pipelines.BaikespriderPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
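The first script rotated requests through a small ip_pool; the same idea can be wired into Scrapy via the DOWNLOADER_MIDDLEWARES hook that is commented out above. A minimal sketch, not part of this project as posted; the class name, module path, and proxy list are placeholders:

# baikeSprider/middlewares.py (sketch): attach a random proxy to every request.
import random


class RandomProxyMiddleware(object):
    # Placeholder pool; replace with proxies that actually work.
    ip_pool = [
        '119.98.44.192:8118',
        '111.198.219.151:8118',
        '101.86.86.101:8118',
    ]

    def process_request(self, request, spider):
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'].
        request.meta['proxy'] = 'http://' + random.choice(self.ip_pool)


# And in settings.py:
# DOWNLOADER_MIDDLEWARES = {
#     'baikeSprider.middlewares.RandomProxyMiddleware': 543,
# }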