Approach 1: crawl all Taobao food listings in a single run
1. The spider.py file is as follows:
__author__ = 'Administrator'
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
from pyquery import PyQuery as pq
from config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.Chrome()
"""
To switch from Chrome to PhantomJS:
1. Install PhantomJS first
2. Pass in some custom options; here we skip loading images and enable the disk cache
   browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
3. Set the window size
   browser.set_window_size(1400, 900)
"""

wait = WebDriverWait(browser, 10)  # explicit wait: poll up to 10 seconds when locating target elements


def search():
    # print('searching')  # useful when debugging under PhantomJS
    try:
        browser.get('https://www.taobao.com')
        input1 = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))  # locate the search box
        )
        submit = wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')))  # locate the search button
        # KEYWORD is defined in the config file (defaults to '美食')
        input1.send_keys(KEYWORD)
        submit.click()
        total = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
        # locate the total page count; the selector was obtained via right-click -> Copy selector
        # parse the first page of results
        get_products()
        return total.text

    except TimeoutException:
        return search()


# turn pages via the page-number input box
def next_page(page_number):
    # print('turning to page', page_number)  # useful when debugging under PhantomJS
    try:
        input1 = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input'))
        )
        submit = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit'))
        )
        input1.clear()
        input1.send_keys(page_number)
        submit.click()
        # the selected page number gets highlighted, so use that to confirm the jump succeeded
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
        # parse this page of results
        get_products()

    except TimeoutException:
        next_page(page_number)


# parse the product list
def get_products():
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        # save the record to MongoDB
        save_to_mongo(product)


# helper that saves one record to MongoDB
def save_to_mongo(result):
    try:
        if db[MON_TABLE].insert(result):
            print('saved to MongoDB', result)
    except Exception:
        print('failed to save to MongoDB', result)


def main():
    try:
        # total.text looks like '共 100 页,'; extract the page count as an integer
        total = search()
        total = int(re.compile('(\d+)').search(total).group(1))
        # crawl the remaining pages
        for i in range(2, total + 1):
            next_page(i)
    except Exception:
        print('something went wrong')

    finally:
        browser.close()


if __name__ == '__main__':
    main()
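One note on persistence: save_to_mongo above uses pymongo's legacy Collection.insert(), which was deprecated in pymongo 3.x and removed in 4.x. If your pymongo no longer has it, a minimal drop-in sketch using insert_one() (assuming the same db and MON_TABLE from config.py) could look like this:

# Hypothetical variant of save_to_mongo for pymongo releases without the legacy insert().
# insert_one() returns an InsertOneResult whose inserted_id is set on success.
def save_to_mongo(result):
    try:
        inserted = db[MON_TABLE].insert_one(result)
        if inserted.inserted_id:
            print('saved to MongoDB', result)
    except Exception:
        print('failed to save to MongoDB', result)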
2. The config.py file is as follows:
__author__ = 'Administrator'
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MON_TABLE = 'product'

# PhantomJS options: skip image loading, enable the disk cache
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
KEYWORD = '美食'
Approach 2: the method above runs correctly when tested, but it crawls all of the data in one go, which produces a large volume of data and offers no flexible control over what gets fetched. The code below improves on it in the following ways:
1. Put the search keyword directly in the URL
2. Crawl the product listings page by page
3. Use Chrome's headless mode (a minimal sketch follows this list)
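Here is a minimal sketch of points 1 and 3, before the full script: the search keyword is percent-encoded into the URL with quote(), and Chrome is started in headless mode. The chrome_options= keyword matches the Selenium 3-era API used in the script below; newer Selenium versions take options= instead.

# Minimal sketch: keyword embedded in the search URL + headless Chrome.
# Assumes the Selenium 3-era chrome_options keyword, as in the full script below.
from selenium import webdriver
from urllib.parse import quote

KEYWORD = '美食'  # normally imported from config.py

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)

# quote() percent-encodes the keyword so non-ASCII characters are URL-safe
browser.get('https://s.taobao.com/search?q=' + quote(KEYWORD))
print(browser.title)
browser.close()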
import pymongo
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from pyquery import PyQuery as pq
from config import *
from urllib.parse import quote

# browser = webdriver.Chrome()
# browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(chrome_options=chrome_options)

wait = WebDriverWait(browser, 10)
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def index_page(page):
    """
    Crawl one page of search results
    :param page: page number
    """
    print('crawling page', page)
    try:
        url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
        browser.get(url)
        if page > 1:
            # locate the page-number input box
            input = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
            # locate the confirm button for the page jump
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
            input.clear()
            input.send_keys(page)
            submit.click()

        """
        Verify that we have jumped to the requested page.
        It is enough to check that the currently highlighted page number equals the requested one.
        The wait condition text_to_be_present_in_element succeeds once the given text appears inside
        the given node, so we pass it the CSS selector of the highlighted page node and the target page number.
        """
        wait.until(
            EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))

        # wait for the product list to load; '.m-itemlist .items .item' matches each product entry,
        # and once it is present we call get_products()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
        get_products()
    except TimeoutException:
        index_page(page)


# parse the product list
def get_products():
    """
    Extract product data
    """
    html = browser.page_source
    doc = pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text(),
            'deal': item.find('.deal-cnt').text(),  # number of deals
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mongo(product)


def save_to_mongo(result):
    """
    Save one record to MongoDB
    :param result: the record to save
    """
    try:
        if db[MONGO_COLLECTION].insert(result):
            print('saved to MongoDB')
    except Exception:
        print('failed to save to MongoDB')


def main():
    """
    Iterate over every page
    """
    for i in range(1, MAX_PAGE + 1):
        index_page(i)
    browser.close()


if __name__ == '__main__':
    main()
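After a run, a quick way to confirm what was stored is to read the collection back with pymongo. This is a hypothetical check script, assuming the MONGO_URL, MONGO_DB and MONGO_COLLECTION values from the configuration below:

# Hypothetical check script: count and peek at the stored products.
import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_COLLECTION

client = pymongo.MongoClient(MONGO_URL)
collection = client[MONGO_DB][MONGO_COLLECTION]

# count_documents() needs pymongo >= 3.7; older releases use collection.count()
print('stored products:', collection.count_documents({}))
for doc in collection.find().limit(3):
    print(doc)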
The corresponding config.py is as follows:
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_COLLECTION = 'products'
KEYWORD = 'ipad'
MAX_PAGE = 100
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']