zoukankan      html  css  js  c++  java
  • selenium+PhantomJS 抓取淘宝搜索商品

    最近项目有需求,需要抓取淘宝的搜索商品,而且要抓取的品类比较多。这里直接用 selenium + PhantomJS 抓取搜索结果(下面的代码以天猫 tmall.com 的搜索页为例),可以快速完成。

    #-*- coding:utf-8 -*-
    __author__ =''
    import logging
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import time,re
    from tqdm import tqdm
    from pyquery import PyQuery as pq
    from tianmao.data_tmall import keywords
    from dbutils import mysql_util
    from config import retry_count
    """
    抓取天猫,
    """
    # Root logger setup: INFO level; every record carries the timestamp, level,
    # file name, thread name, line number and function name before the message.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)s] [%(filename)s] [%(threadName)s] [line:%(lineno)d] [%(funcName)s] %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )


    class tianmao_spider(object):
        """Crawl Tmall search results with Selenium + PhantomJS and store them in MySQL.

        For every (category, keyword) pair in ``keywords`` the spider submits a
        search on the Tmall home page, scrapes the product list of each result
        page and saves one row per product through ``mysql_util``.
        """

        # CSS selectors used on the Tmall home page and search result page.
        _SEARCH_INPUT = '#mq'
        _SEARCH_BUTTON = '#mallSearch > form > fieldset > div > button'
        _PAGER_FORM = '#content > div > div.ui-page > div > b.ui-page-skip > form'
        _PAGER_INPUT = _PAGER_FORM + ' > input.ui-page-skipTo'
        _PAGER_BUTTON = _PAGER_FORM + ' > button'
        _ITEM_LIST = '#J_ItemList'

        # Searches that failed on every attempt are appended to this file.
        _RETRY_FILE = 'tmall_retry_crawl.txt'

        # First run of digits in the pager text is taken as the page count.
        _PAGE_COUNT_RE = re.compile(r'(\d+)')

        def __init__(self):
            """Start a PhantomJS browser, a 10-second explicit wait and the MySQL helper."""
            self.SERVICE_ARGS = ['--disk-cache=true', '--load-images=false']
            self.target_url = 'https://www.tmall.com/'
            # NOTE(review): webdriver.PhantomJS was deprecated in Selenium 3.x and
            # removed in Selenium 4; this requires an older Selenium release.
            self.browser = webdriver.PhantomJS(service_args=self.SERVICE_ARGS)
            self.wait = WebDriverWait(self.browser, 10)  # 10-second timeout
            self.browser.set_window_size(1400, 900)
            self.mysql_util = mysql_util()

        @staticmethod
        def _parse_total_pages(text):
            """Return the first integer found in the pager text, or 0 if there is none."""
            match = tianmao_spider._PAGE_COUNT_RE.search(text or '')
            return int(match.group(1)) if match else 0

        @staticmethod
        def _with_scheme(url, scheme):
            """Prefix ``url`` with ``scheme`` unless it is None or already starts with 'http'."""
            if url is None or str(url).startswith('http'):
                return url
            return scheme + url

        def search(self, category, keyword, page=2):
            """Submit ``keyword`` on the home page and return the number of result pages.

            When ``page`` is 2 (a fresh crawl) the first result page is scraped
            here as well. The whole sequence is attempted up to ``retry_count``
            times on ``TimeoutException``; if every attempt fails, the
            category/keyword pair is appended to the retry file.

            Returns:
                int: the page count parsed from the pager, or 0 when the search
                never succeeded or the pager contained no number.
            """
            print('正在搜索:{0}'.format(keyword))
            total = 0
            for attempt in range(retry_count):
                try:
                    self.browser.get(self.target_url)
                    search_box = self.wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, self._SEARCH_INPUT)))
                    submit = self.wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, self._SEARCH_BUTTON)))
                    search_box.send_keys(keyword)
                    submit.click()
                    # Kept in its own variable so that ``total`` is always an int,
                    # even if a later step in this attempt times out.
                    pager_forms = self.wait.until(
                        EC.presence_of_all_elements_located((By.CSS_SELECTOR, self._PAGER_FORM)))
                    if page == 2:  # fresh crawl: scrape the first result page too
                        self.get_products(category, keyword)
                    total = self._parse_total_pages(pager_forms[0].text)
                    if total == 0:
                        logging.warning('no page count found in pager for keyword %s', keyword)
                    break
                except TimeoutException as e:
                    logging.info("正在重试第{0}次,出现:{1}".format(attempt + 1, e))
                    if attempt == retry_count - 1:
                        # Last attempt failed: record the pair for a later re-crawl.
                        with open(self._RETRY_FILE, 'a', encoding='utf-8') as f:
                            f.write(category + ":" + keyword + ' ')
                    time.sleep(1)
            return total

        def get_products(self, category, keyword):
            """Scrape every product on the current result page and save it to MySQL.

            Waits for the item list to be present, parses the page source with
            PyQuery and writes one ``t_tmall`` row per ``.product`` element.
            Raises ``TimeoutException`` if the item list does not appear.
            """
            self.wait.until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, self._ITEM_LIST)))
            doc = pq(self.browser.page_source)
            for item in doc('#J_ItemList .product').items():
                # Image URL: prefer ``src``, fall back to the lazy-load ``data-src``.
                img = item.find('img')
                img_url = img.attr('src')
                if img_url is None:
                    img_url = img.attr('data-src')
                img_url = self._with_scheme(img_url, 'http:')
                img_save_path = ''  # image download is disabled

                # Detail page URL (the detail page itself is not fetched).
                item_url = item.find('a').attr('href')
                logging.info('详情页面url:{0}'.format(item_url))
                item_url = self._with_scheme(item_url, 'https:')
                item_detail = ''

                product = {
                    'target': 'tmall',
                    'category': category,
                    'keyword': keyword,
                    'item_url': item_url,
                    'image_url': img_url,
                    'image_save_path': img_save_path,
                    'title': item.find('div > div.productTitle').text(),
                    'price': item.find('div > p.productPrice').text(),
                    'deal': item.find('div > p.productStatus').text().replace('阿里旺旺', '').strip(),
                    'shop': item.find('div > div.productShop').text(),
                    'location': '',
                    'item_detail': item_detail,
                    'create_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
                }
                self.mysql_util.sava_to_mysql('t_tmall', product)

        def next_page(self, page_number, category, keyword):
            """Jump to result page ``page_number`` via the pager form and scrape it.

            The jump is attempted up to ``retry_count`` times on
            ``TimeoutException``; after the last failure the page is skipped.
            """
            for _ in range(retry_count):
                try:
                    page_box = self.wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, self._PAGER_INPUT)))
                    submit = self.wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, self._PAGER_BUTTON)))
                    page_box.clear()
                    page_box.send_keys(str(page_number))
                    submit.click()
                    # Wait for the pager button to be clickable again before scraping.
                    self.wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, self._PAGER_BUTTON)))
                    self.get_products(category, keyword)
                    break
                except TimeoutException as e:
                    logging.info(e)
                    time.sleep(0.5)

        def start_crawler(self, page=2):
            """Crawl every keyword of every category, then shut the browser down.

            Args:
                page: first page to request through the pager. The default 2 means
                    a fresh crawl (page 1 is scraped by ``search``); a larger value
                    resumes the first keyword from that page.

            Any exception is logged and ends the crawl; the browser is always
            shut down afterwards.
            """
            try:
                for category, keyword_list in keywords.items():
                    for keyword in keyword_list:
                        total = self.search(category, keyword, page=page)
                        for i in tqdm(range(page, total + 1)):
                            print('总共{0}页,正在翻第{1}页,抓取类别:{2},搜索关键字:{3}'.format(total, i, category, keyword))
                            self.next_page(i, category, keyword)
                        # Only the first keyword may resume mid-way; every later
                        # keyword starts from page 2 again.
                        # NOTE(review): keyword-level placement of this reset and of
                        # the sleep is reconstructed from a listing that lost its
                        # indentation - confirm against the original script.
                        page = 2
                        time.sleep(0.5)
            except Exception:
                logging.exception('crawl aborted')
            finally:
                # quit() ends the WebDriver session and the PhantomJS process;
                # close() would only close the current window.
                self.browser.quit()


    # Script entry point: build the spider and crawl every configured keyword,
    # starting each one from the first result page.
    if __name__ == '__main__':
        tmall = tianmao_spider()
        tmall.start_crawler()
  • 相关阅读:
    Java三年经验
    系统集成项目管理 - 笔记
    ZK
    older versions of the JRE and JDK
    [提高组集训2021] 古老的序列问题
    CF1556G Gates to Another World
    Codeforces Round #743 (Div. 1)
    [提高组集训2021] 蚂蚁
    [LOJ 6669] Nauuo and Binary Tree
    [ABC219H] Candles
  • 原文地址:https://www.cnblogs.com/hd-zg/p/8412693.html
Copyright © 2011-2022 走看看