爬取赶集网二手物品下所有物品的信息。
大致思路:
1、爬取频道页url;
2、爬取商品详情页url,写入mongodb的url_list表;
3、从url_list表读取url,爬取商品信息,写入mongodb的p_info表。
分成3个py文件:
1、channel_url.py,获取频道页url;
2、page_parse.py,主要是2个爬虫函数,分别完成2个数据表的写入;
3、main.py,主程序文件,也就是本文件,开启多进程,完成整个工作
最后顺利完成任务,感觉现在赶集真不行了,没多少数据。
channel_url.py文件:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Entry page listing every second-hand channel, and the host used to
# resolve the relative links found on it.
start_url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'


def get_channel_url(url, timeout=10):
    """Return the absolute URLs of all channel links found on *url*.

    Each URL is also printed as it is discovered.

    Args:
        url: Address of the page that lists the channels.
        timeout: Seconds to wait for the HTTP response; without a timeout
            a stalled connection would block the crawler forever.

    Returns:
        A list of channel URLs, in page order.
    """
    raw_data = requests.get(url, timeout=timeout).text
    soup = BeautifulSoup(raw_data, 'lxml')
    channel_urls = []
    for e in soup.select('div.content dt>a'):
        href = e.get('href')
        if not href:
            # An anchor without an href cannot be turned into a URL.
            continue
        # urljoin copes with relative paths (with or without a leading
        # slash) as well as links that are already absolute.
        channel_url = urljoin(url_host, href)
        print(channel_url)
        channel_urls.append(channel_url)
    return channel_urls


# channel_urls = get_channel_url(start_url)
# print('len(channel_urls):',len(channel_urls))

# Saved output of a previous get_channel_url(start_url) run, so the channel
# page does not have to be fetched again. Consumers split it on whitespace.
channel_urls = '''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
page_parse.py文件:
import requests
from bs4 import BeautifulSoup
from time import sleep
from pymongo import MongoClient

# MongoDB handles: product URLs go to `url_list`, parsed product details
# go to `p_info`, both in the `ganji` database on the local server.
client = MongoClient('localhost', 27017)
ganji = client['ganji']
url_list = ganji['url_list']
p_info = ganji['p_info']


def _first(soup, selector):
    """Return the first element matching *selector*, or None if none match."""
    matches = soup.select(selector)
    return matches[0] if matches else None


def _first_text(soup, selector):
    """Return the text of the first element matching *selector*, or None."""
    node = _first(soup, selector)
    return node.get_text() if node is not None else None


def get_product_url(url, timeout=10):
    """Crawl every listing page of a channel and store the product URLs.

    Starting from the channel URL, each product link on the page is printed
    and inserted into the `url_list` collection. Pagination continues for as
    long as the page contains a "next" link.

    Args:
        url: Channel URL; further pages are built as ``<url>o<page>/``.
        timeout: Seconds to wait for each HTTP response, so a stalled
            connection cannot hang the worker forever.
    """
    channel_url = url
    page_num = 1
    while True:
        raw_page = requests.get(url, timeout=timeout).text
        print('正在get网页:', url)
        sleep(2)  # throttle requests to the site
        soup = BeautifulSoup(raw_page, 'lxml')
        eles = soup.select('a.ft-tit')
        print('len(eles):', len(eles))
        for e in eles:
            p_url = e.get('href')
            if not p_url:
                # Do not store entries that cannot be fetched later.
                continue
            url_list.insert_one({'p_url': p_url})
            print(p_url)
        if not soup.select('a.next'):
            break
        page_num += 1
        url = channel_url + 'o' + str(page_num) + '/'


def get_product_info(url, timeout=10):
    """Fetch a product detail page, then print and store its parsed fields.

    Title, category breadcrumbs, date, price and address are extracted; a
    field whose element is absent is stored as None. Nothing is stored when
    the page reports that the listing has just been deleted.

    Args:
        url: Product detail page URL.
        timeout: Seconds to wait for the HTTP response.
    """
    raw_page = requests.get(url, timeout=timeout).text
    sleep(2)  # throttle requests to the site
    soup = BeautifulSoup(raw_page, 'lxml')

    # The site shows this notice in place of a listing that was removed.
    if soup.select("p:contains('信息刚被删除~')"):
        print('信息刚被删除~')
        return

    crumbs = _first(soup, 'div.crumbs.routes.clearfix')
    category = list(crumbs.stripped_strings) if crumbs is not None else None

    # NOTE(review): the separator literal was corrupted in the original
    # source (an unterminated '\' string). '\xa0' (non-breaking space) is
    # assumed to be the intended delimiter between the date and the text
    # that follows it -- confirm against a live page.
    date_text = _first_text(soup, 'i.pr-5')
    date = date_text.split('\xa0')[0].strip() if date_text is not None else None

    p_dict = {
        'title': _first_text(soup, 'h1.title-name'),
        'category': category,
        'date': date,
        'price': _first_text(soup, 'i.f22.fc-orange.f-type'),
        'address': _first_text(soup, 'ul.det-infor>li:nth-child(2)>a'),
        'url': url,
    }
    p_info.insert_one(p_dict)
    print(p_dict)
main.py文件:
# Importing a name from channel_url.py executes that whole module, but only
# the imported name is bound here.
from channel_url import channel_urls
# url_list is needed here to read the stored product URLs back.
from page_parse import get_product_url, get_product_info, url_list
from multiprocessing import Pool
from datetime import datetime


def read_all_p_urls():
    """Return every product URL stored in the `url_list` collection.

    Documents without a usable `p_url` value (missing, None or empty) are
    skipped, since they cannot be fetched by the crawler.
    """
    return [item['p_url'] for item in url_list.find() if item.get('p_url')]


if __name__ == '__main__':
    start_time = datetime.now()

    # Sequential alternative (takes several times longer):
    # for channel in channel_urls.split():
    #     get_product_url(channel)

    # Pool() picks the process count automatically; per the original notes,
    # Pool(processes=4) took about the same time. The context manager makes
    # sure the worker processes are cleaned up when the work is done.
    with Pool() as pool:
        # Step 1: from each channel URL, collect product URLs into MongoDB.
        pool.map(get_product_url, channel_urls.split())
        # Step 2: from each stored product URL, collect product details into
        # MongoDB. This step can also be run separately from step 1.
        pool.map(get_product_info, read_all_p_urls())

    during = datetime.now() - start_time
    print('总共耗时:', during)