zoukankan      html  css  js  c++  java
  • 爬起点小说day03

    # 把所有类别的前3页的小说爬取下来

    import scrapy
    from scrapy.http import Request
    from time import sleep
    from qidianNovel.spiders.connectionSQL import getredis,getMongodb
    # 把起点首页的所有列表,起点是最后两页没有下一页(此处当做一页)
    class spider_list_novel(scrapy.Spider):
    name = "spider_list_novel" #要调用的名字
    allowed_domains = ["qidian.com"] #分一个域
    start_urls = []
    dict = {}
    red = getredis()
    mongodb=getMongodb('novel','novels')
    def __init__(self):
    urls = self.red.lrange('bnovel_all_list', 0, -1)
    for url in urls:
    url = str(url, encoding="utf-8")
    url = url.split(',')
    spider_list_novel.start_urls.append(url[2])
    spider_list_novel.dict[url[2]] ={'classId':url[0],'listId':url[1],'sum':0}
    # break
    #每爬完一个网页会回调parse方法
    def parse(self, response):
    print(response.url)
    Pid = self.dict[response.url]
    Pid['sum']+=1
    print(Pid['sum'])
    if Pid['sum']>3:
    return
    links = response.xpath('//div[@class="book-mid-info"]/h4/a')
    for link in links:
    novel_name = link.select("text()").extract()[0]
    novel_id = self.mongodb.insert({'name': novel_name, 'total_list': Pid['classId'], 'list': Pid['listId']})
    href = link.select("@href").extract()[0]
    href = str(novel_id) + ',' + 'https:' + href
    print(href)
    self.red.lpush('all_novel_href',href)
    sleep(0.3)
    href=self.find_next(response)
    if href==None:
    f = open('file/%s.txt' % ("日志"), 'a', encoding='utf-8')
    f.write(response.url)
    f.write('++++++++++++++')
    f.close()
    else:
    href="https:"+href
    if href.find('javascript:;')<0:
    self.dict[href] = Pid
    request=Request(href,callback=self.parse)
    yield request
    def find_next(self,response):
    try:
    hrefs =response.xpath('//li[@class="lbf-pagination-item"]/a')
    i=len(hrefs)
    href=hrefs[i - 1].select("@href").extract()[0]
    return href
    except Exception as err:
    f = open('file/%s.txt' % ("日志"), 'a', encoding='utf-8')
    f.write(str(err)+':'+href)
    f.close()
    return None

  • 相关阅读:
    函数
    python操作文件
    POJ-2689-Prime Distance(素数区间筛法)
    POJ-2891-Strange Way to Express Integers(线性同余方程组)
    POJ-2142-The Balance
    POJ-1061-青蛙的约会(扩展欧几里得)
    Educational Codeforces Round 75 (Rated for Div. 2) D. Salary Changing
    Educational Codeforces Round 75 (Rated for Div. 2) C. Minimize The Integer
    Educational Codeforces Round 75 (Rated for Div. 2) B. Binary Palindromes
    Educational Codeforces Round 75 (Rated for Div. 2) A. Broken Keyboard
  • 原文地址:https://www.cnblogs.com/qieyu/p/7994874.html
Copyright © 2011-2022 走看看