  • Two notes from learning pyspider (a Python web crawler)
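    Both scripts follow the same pyspider pattern: on_start seeds the crawl, index_page fans out across the links on each fetched page, and detail_page extracts the fields of interest and returns them. A minimal sketch of that skeleton (example.com is a placeholder seed URL, not from the original):

        from pyspider.libs.base_handler import *

        class Handler(BaseHandler):
            crawl_config = {}

            @every(minutes=24 * 60)           # re-run the seed once a day
            def on_start(self):
                self.crawl('http://example.com/', callback=self.index_page)

            @config(age=10 * 24 * 60 * 60)    # treat a fetched page as fresh for ten days
            def index_page(self, response):
                for each in response.doc('a[href^="http"]').items():
                    self.crawl(each.attr.href, callback=self.detail_page)

            @config(priority=2)               # detail pages are scheduled ahead of index pages
            def detail_page(self, response):
                return {"url": response.url, "title": response.doc('title').text()}

    The first record crawls product listings on www.yunjinet.com: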

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-11-08 09:56:40
    # Project: product

    from pyspider.libs.base_handler import *
    import re
    import base64
    import os
    import urllib
    import urllib.request
    import requests
    import json


    class Handler(BaseHandler):

        # Intended as a json.JSONEncoder hook that converts bytes to str so
        # base64 data can pass through json.dumps; nothing in this script
        # actually wires it up (see the sketch after the script).
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)

        crawl_config = {
        }

        @every(minutes=24 * 60)
        def on_start(self):
            self.crawl('http://www.yunjinet.com/sell/list/7934/', callback=self.index_page)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            for each in response.doc('a[href^="http"]').items():
                # Product detail pages live under /sell/show...; everything else
                # is treated as another index page.
                if re.match(r'http://www\.yunjinet\.com/sell/show.+', each.attr.href):
                    self.crawl(each.attr.href, callback=self.detail_page)
                else:
                    self.crawl(each.attr.href, callback=self.index_page)


        @config(priority=2)
        def detail_page(self, response):
            image_url_list = []   # image URLs
            tags = []             # category tags
            images = []           # base64-encoded images
            x = 0                 # image counter, used for filenames
            imageresult = []      # per-image {'imgBase64': ...} objects
            results = []          # final results: every JSON document goes here
            result = dict()       # the JSON document for this page
            headers = {"Content-Type": "application/json"}
            path = r'D:\pythonlianxi\testimg'   # local folder for downloaded images

            if not os.path.isdir(path):
                os.makedirs(path)
            paths = path + '\\'   # trailing backslash so filenames can be appended

            for img in response.doc('div[class="vertical-img"] img').items():
                image_url_list.append(img.attr.src)
                # Download the image to disk, then re-read it for base64 encoding.
                urllib.request.urlretrieve(img.attr.src, '{0}{1}.jpg'.format(paths, x))
                with open(paths + str(x) + ".jpg", "rb") as f:
                    base64_data = base64.b64encode(f.read()).decode()
                images.append(base64_data)
                imgurl = dict()   # wraps the base64 string
                imgurl['imgBase64'] = base64_data
                imageresult.append(imgurl)
                x = x + 1


            for each in response.doc('div[class="location_an mt_10"]').items('a'):
                tags.append(each.text())



            pricebefore = response.doc('p[class="s"]').text()
            findlist = re.findall(r'[0-9]*\.?[0-9]+', pricebefore)   # first number in the price text
            if not len(findlist):
                findlist = [0]   # no price found: fall back to 0
            print(findlist[0])

            result['originalLink'] = response.url
            result['productName'] = response.doc('h1').text()
            result['price'] = findlist[0]
            result['productDescription'] = response.doc('div[class="product_content"]').text()
            result['category1'] = tags[2]
            result['category2'] = tags[3]
            result['category3'] = tags[4]
            result['images'] = imageresult

            results.append(result)
            print(result)

            # Index the document into the local Elasticsearch service.
            payload = json.dumps(result)
            r = requests.post('http://192.168.1.115/es/index/product', data=payload, headers=headers)

            return {
                "originalLink": response.url,
                "productName": response.doc('h1').text(),
                "price": response.doc('p[class="s"]').text(),
                "productDescription": response.doc('div[class="product_content"]').text(),
                "category1": tags[2],
                "category2": tags[3],
                "category3": tags[4],
                "images": images,
            }
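
    The default method defined on the handler above is meant as a bytes-to-str hook for json.dumps, but nothing in the script ever calls it; json.dumps(result) succeeds here only because base64_data is already decoded to str. A minimal sketch of how such a hook is normally wired up (the BytesEncoder name is illustrative, not from the original):

        import json

        class BytesEncoder(json.JSONEncoder):
            # Fallback invoked by json.dumps for types it cannot serialize natively.
            def default(self, obj):
                if isinstance(obj, bytes):
                    return str(obj, encoding='utf-8')
                return json.JSONEncoder.default(self, obj)

        payload = json.dumps({'imgBase64': b'aGVsbG8='}, cls=BytesEncoder)
        print(payload)   # {"imgBase64": "aGVsbG8="}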

    -------------------------------

    The second record adapts the same handler to suning.com: the site is served over HTTPS, so every self.crawl call passes validate_cert=False; product images are read from a lazy-loaded src2 attribute; and the Elasticsearch POST is left commented out.

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-11-08 09:56:40
    # Project: product

    from pyspider.libs.base_handler import *
    import re
    import base64
    import os
    import urllib
    import urllib.request
    import requests
    import json


    class Handler(BaseHandler):

        # Same bytes-to-str json.JSONEncoder hook as in the first script;
        # again never actually invoked.
        def default(self, obj):
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)

        crawl_config = {
        }

        @every(minutes=24 * 60)
        def on_start(self):
            self.crawl('https://product.suning.com/0000000000/10629204175.html#?safp=d488778a_10004_0_daa73474ac', callback=self.index_page, validate_cert=False)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            for each in response.doc('a[href^="http"]').items():
                if re.match(r'https://product\.suning\.com/.+', each.attr.href):
                    self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
                else:
                    self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)


        @config(priority=2)
        def detail_page(self, response):
            image_url_list = []   # image URLs
            tags = []             # category tags
            images = []           # base64-encoded images (unused in this version)
            x = 0                 # image counter, used for filenames
            imageresult = []      # per-image {'imgBase64': ...} objects
            results = []          # final results (unused in this version)
            result = dict()       # the JSON document for this page
            #headers = {"Content-Type": "application/json"}
            path = r'D:\pythonlianxi\testimg'   # local folder for downloaded images

            if not os.path.isdir(path):
                os.makedirs(path)
            paths = path + '\\'   # trailing backslash so filenames can be appended

            for img in response.doc('div[moduleId="R1901001_3"]').items('img'):
                # src2 holds the lazy-loaded image URL; prepend a scheme if missing.
                if re.match('http.+', img.attr.src2):
                    imgurl = img.attr.src2
                else:
                    imgurl = 'https://' + img.attr.src2
                # Download the image to disk, then re-read it for base64 encoding.
                urllib.request.urlretrieve(imgurl, '{0}{1}.jpg'.format(paths, x))
                with open(paths + str(x) + ".jpg", "rb") as f:
                    base64_data = base64.b64encode(f.read()).decode()
                imgurl = dict()   # wraps the base64 string
                imgurl['imgBase64'] = base64_data
                imageresult.append(imgurl)
                x = x + 1


            for each in response.doc('a[class="ft"]').items():
                tags.append(each.text())

            # The price extraction from the first script does not apply on this
            # page, so a fixed placeholder price is used below.
            #pricebefore = response.doc('p[class="s"]').text()
            #findlist = re.findall(r'[0-9]*\.?[0-9]+', pricebefore)
            #if not len(findlist):
            #    findlist = [0]


            result['originalLink'] = response.url
            result['productName'] = response.doc('h1').text()
            result['price'] = 3000   # hard-coded placeholder price
            result['productDescription'] = response.doc('meta[name="description"]').attr.content
            result['category1'] = tags[0]
            result['category2'] = tags[1]
            result['category3'] = tags[2]
            result['images'] = imageresult

            #results.append(result)
            #print(result)

            #payload = json.dumps(result)
            #r = requests.post('http://192.168.1.115/es/index/product', data=payload, headers=headers)

            return {
                "originalLink": response.url,
                "productName": response.doc('h1').text(),
                #"price": response.doc('p[class="s"]').text(),
                "productDescription": response.doc('meta[name="description"]').attr.content,
                "category1": tags[0],
                "category2": tags[1],
                "category3": tags[2],
                "images": imageresult,
            }
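
    Both scripts download every image to D:\pythonlianxi\testimg and then re-read the file from disk just to base64-encode it. Since requests is already imported, the disk round trip can be skipped entirely; a minimal sketch (image_to_base64 is a hypothetical helper, not part of the original scripts):

        import base64
        import requests

        def image_to_base64(url):
            # Fetch the image bytes and encode them in memory.
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
            return base64.b64encode(resp.content).decode()

        # Inside detail_page this would replace the urlretrieve/open/read steps:
        # imageresult.append({'imgBase64': image_to_base64(img.attr.src)})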

  • Original article: https://www.cnblogs.com/lely/p/9936455.html