zoukankan      html  css  js  c++  java
  • 基于pyspider爬取1药网(转载)

    1.商品爬取

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2019-02-02 08:59:40
    # Project: oneDrug
    
    from pyspider.libs.base_handler import *
    from pymongo import MongoClient
    import re
    
    
    class Handler(BaseHandler):
        """pyspider handler that crawls goods pages on www.111.com.cn (1药网)
        and upserts one document per product into the local MongoDB
        ``drug.goods`` collection."""

        crawl_config = {
        }

        def __init__(self):
            # One MongoDB connection per handler instance; `drug` is the target DB.
            self.client = MongoClient('mongodb://localhost:27017')
            self.drug = self.client.drug

        def insert_goods(self, data):
            """Upsert one goods document, keyed by its `goods_id`.

            Collection.update() is deprecated in pymongo; replace_one(...,
            upsert=True) has the same spec/document/upsert semantics.
            """
            collection = self.drug['goods']
            collection.replace_one({'goods_id': data['goods_id']}, data, upsert=True)

        def insert_comments(self, data):
            """Append one raw comment document (no dedup key available)."""
            collection = self.drug['comments']
            collection.insert_one(data)

        @every(minutes=24 * 60)
        def on_start(self):
            # Entry point: the full category index, re-queued once a day.
            self.crawl('https://www.111.com.cn/categories/', callback=self.categories_page, validate_cert=False,
                       fetch_type='js')

        @config(age=10 * 24 * 60 * 60)
        def categories_page(self, response):
            # Fan out to every leaf-category link on the "all categories" page.
            for each in response.doc('.allsort em > a').items():
                self.crawl(each.attr.href, callback=self.cagetory_list_page, validate_cert=False, fetch_type='js')

        @config(priority=1)
        def cagetory_list_page(self, response):
            """Queue every product detail link on a listing page, then follow pagination."""
            for each in response.doc('#itemSearchList a[target="_blank"][class="product_pic pro_img"]').items():
                self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False, fetch_type='js')
            # Bug fix: on the last page the "next" anchor is absent, attr.href is
            # None, and crawling None would fail inside pyspider. (Also renamed
            # `next`, which shadowed the builtin.)
            next_page = response.doc('#search_table > div.turnPageBottom > a.page_next').attr.href
            if next_page:
                self.crawl(next_page, callback=self.cagetory_list_page, validate_cert=False, fetch_type='js')

        @config(priority=2)
        def detail_page(self, response):
            """Scrape one product page into a single `goods` document and upsert it."""
            goods_id = response.doc('#gallery_view > ul > li.item_number').text()
            # Breadcrumb levels; nth-child 3/5/7 skip the ">" separator spans.
            cagetory_one = response.doc('body > div.wrap.clearfix > div > span:nth-child(3) > a').text()
            cagetory_two = response.doc('body > div.wrap.clearfix > div > span:nth-child(5) > a').text()
            cagetory_three = response.doc('body > div.wrap.clearfix > div > span:nth-child(7) > a').text()
            merchants = response.doc('div.middle_property > span:nth-child(1)').text()
            goods_name = response.doc('div.middle_property > h1').text()
            goods_desc = response.doc('div.middle_property > span.red.giftRed').text()
            goods_price = response.doc(
                'div.middle_property > div.shangpin_info > dl:nth-child(2) > dd > span.good_price').text()
            # NOTE(review): the original also read '#fristReviewCount > span > a'
            # into total_comments here, but unconditionally overwrote it below —
            # the dead assignment was dropped.

            # Product spec table: brand/spec/weight/manufacturer/approval/type.
            brand = response.doc(
                '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(2) > td:nth-child(2)').text()
            spec = response.doc(
                '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(2) > td:nth-child(4)').text()
            weight = response.doc(
                '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(3) > td:nth-child(2)').text()
            manufacturers = response.doc(
                '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(3) > td:nth-child(4)').text()
            approval_number = response.doc(
                '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(4) > td:nth-child(2)').text()
            drug_type = response.doc(
                '#tabCon > div:nth-child(1) > div.goods_intro > table > tbody > tr:nth-child(4) > td:nth-child(4)').text()

            instructions = {}
            if response.doc('#prodDetailCotentDiv > table > tbody > tr:nth-child(1) > th').text():
                # Rows 3..21 of the instructions table are "<th>key</th><td>value</td>"
                # pairs; the th text keeps only the token before the first space.
                for i in range(3, 22):
                    # Bug fix: the assignment was split from its expression in the
                    # original (a lost line continuation), a SyntaxError.
                    instructions_key = response.doc(
                        '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > th'.format(i)).text().split(
                        " ")[0]
                    instructions_value = response.doc(
                        '#prodDetailCotentDiv > table > tbody > tr:nth-child({}) > td'.format(i)).text()
                    instructions[instructions_key] = instructions_value

            # Comment counters: total / good / neutral / bad.
            total_comments = response.doc('#itemComments > span').text()
            good_comments = response.doc('#productExperience > div > ul > li:nth-child(2) > a > span').text()
            mid_comments = response.doc('#productExperience > div > ul > li:nth-child(3) > a > span').text()
            bad_comments = response.doc('#productExperience > div > ul > li:nth-child(4) > a > span').text()

            # Bug fix: r'\d+' — the original pattern 'd+' matched the letter "d",
            # never digits. Assumes the second digit-run in the URL is the
            # product id — TODO confirm against live URLs.
            url_id = re.findall(r'\d+', response.url)[1]

            goods_data = {
                'url_id': url_id,
                'goods_id': goods_id,
                'goods_name': goods_name,
                'goods_desc': goods_desc,
                'goods_price': goods_price,
                'merchants': merchants,
                'cagetory': {
                    '1': cagetory_one,
                    '2': cagetory_two,
                    '3': cagetory_three
                },
                'drug_detail': {
                    'brand': brand,
                    'spec': spec,
                    'weight': weight,
                    'manufacturers': manufacturers,
                    'approval_number': approval_number,
                    'drug_type': drug_type
                },
                'instructions': instructions,
                'comments': {
                    'total_comments': total_comments,
                    'good_comments': good_comments,
                    'mid_comments': mid_comments,
                    'bad_comments': bad_comments
                }
            }
            self.insert_goods(goods_data)

    2.评论爬取

    from pymongo import MongoClient
    import requests
    from bs4 import BeautifulSoup
    import re
    import socket
    
    
    class Drug:
        """Fetch per-goods comment pages from 111.com.cn and store each comment
        in the MongoDB ``drug.comments`` collection."""

        def __init__(self):
            self.clint = MongoClient('mongodb://localhost:27017')
            self.drug = self.clint.drug
            self.collection = self.drug['goods']
            self.comm_collection = self.drug['comments']

        def dbmodify(self):
            """Strip the label/currency prefix from goods_id and goods_price in place.

            NOTE(review): the split separators were lost in transcription — the
            original ``split("")`` raises ValueError on every call. "：" and "¥"
            match the scraped page text ("商品编号：…" / "¥9.90"); confirm against
            live documents.
            """
            for data in self.collection.find({}, {"goods_id": 1, "goods_price": 1}):
                try:
                    _id = data['_id']
                    goods_id = data['goods_id'].split("：")[1]
                    price = data['goods_price'].split("¥")[1]
                    # update_one replaces the deprecated Collection.update().
                    self.collection.update_one(
                        {'_id': _id},
                        {'$set': {'goods_id': goods_id, 'goods_price': price}})
                    print(_id, goods_id, price)
                except IndexError:
                    # Field already normalized (no separator present) — skip it.
                    pass

        def getBaseArgument(self, goods_id):
            """Return the total comment-page count for `goods_id` as a string,
            "No Comments!" when the goods has none, or None on a request error.
            Side effect: marks the goods document as visited (commspider=True)."""
            base_url = 'https://www.111.com.cn/interfaces/review/list/html.action'
            data = {
                'goodsId': goods_id,
                'pageIndex': 1,
                'score': '1&_19020301'
            }
            try:
                self.collection.update_one({'url_id': goods_id}, {'$set': {'commspider': True}})
                requests.packages.urllib3.disable_warnings()
                requests.adapters.DEFAULT_RETRIES = 5
                # Disable keep-alive so sockets are released promptly between goods.
                s = requests.session()
                s.keep_alive = False
                r = s.get(base_url, params=data, timeout=5, verify=False)
                r.close()
                soup = BeautifulSoup(r.text, 'html.parser')
                if soup.find_all("div", class_="view_no_result"):
                    return "No Comments!"
                else:
                    # Bug fix: raw r'\d+' — the original 'd+' matched a literal
                    # "d" and could never find the page count in "共N页".
                    total_page_text = soup.find_all(text=re.compile(r'共\d+页'))[0]
                    pattern = re.compile(r'\d+')
                    total_page = pattern.findall(total_page_text)
                    return total_page[0]
            except requests.exceptions.RequestException as e:
                print(e)

        def getCommlist(self, goods_id, total_page):
            """Fetch every comment page for one goods id and insert each comment.

            Returns "No Comments! Try next!" when `total_page` is not numeric
            (i.e. getBaseArgument reported no comments), otherwise None.
            """
            base_url = 'https://www.111.com.cn/interfaces/review/list/html.action'
            try:
                # Bug fix: range() excludes its upper bound, so the original
                # silently dropped the last comment page — hence the +1.
                for i in range(1, int(total_page) + 1):
                    data = {
                        'goodsId': goods_id,
                        'pageIndex': i,
                        'score': '1&_19020301'
                    }
                    try:
                        requests.packages.urllib3.disable_warnings()
                        requests.adapters.DEFAULT_RETRIES = 15
                        # Disable keep-alive so sockets are released promptly.
                        s = requests.session()
                        s.keep_alive = False
                        r = s.get(base_url, params=data, timeout=5, verify=False)
                        r.close()
                        soup = BeautifulSoup(r.text, 'html.parser')
                        for tr in soup.find_all("tr"):
                            comments = {}
                            try:
                                comments['goodsId'] = goods_id
                                comments['content'] = tr.find('p').text.strip()
                                comments['date'] = tr.find('p', attrs={'class': 'eval_date'}).text.strip()
                                self.comm_collection.insert_one(comments)
                            except AttributeError:
                                # Row without the expected <p> nodes (e.g. a header
                                # row): tr.find() returned None. Bug fix: restore
                                # the '\n' escape lost in transcription (the
                                # original literal spanned lines, a SyntaxError).
                                print(goods_id + "Have some problem!\n")
                            print(comments)
                    except requests.exceptions.RequestException as e:
                        print(e)
            except ValueError:
                return "No Comments! Try next!"

        def getComments(self):
            """Crawl comments for every goods document not yet visited
            (commspider == False). Returns None."""
            i = 0
            goods_list = []
            for data in self.collection.find({'commspider': False}, {"url_id"}):
                goods_list.append(data['url_id'])
            length = len(goods_list)
            print("总共 {} 条商品".format(length))
            for good in goods_list:
                total_page = self.getBaseArgument(good)
                comments = self.getCommlist(good, total_page)
                i = i + 1
                # Bug fix: restore the '\n' escapes lost in transcription (the
                # original literal spanned lines, a SyntaxError).
                print("总共 {} 条商品\n目前第 {} 条\n商品编号 {} \n".format(length, i, good))
                print(comments)
    
    
    if __name__ == '__main__':
        # Guard the crawl behind the script entry point so importing this module
        # no longer triggers a full crawl. getComments() always returns None, so
        # the original `test` binding carried no information; kept for
        # backward compatibility when run as a script.
        test = Drug().getComments()
  • 相关阅读:
    查询BLOB字段的长度
    java中使用公钥加密私钥解密原理实现license控制
    Eclipse调试Bug的七种常用技巧
    mysql视图
    动态缓存技术之CSI,SSI,ESI
    取得图片原来的大小
    fieldset也是表单元素
    isInt
    取得浏览器的文档类型
    option的value、text与label
  • 原文地址:https://www.cnblogs.com/tjp40922/p/10611624.html
Copyright © 2011-2022 走看看