zoukankan      html  css  js  c++  java
  • 晋江年下文爬取【xpath】

    '''
    @Modify Time      @Author   目标:晋江年下文 爬取6页
    ------------      ------- http://www.jjwxc.net/search.php?kw=%C4%EA%CF%C2&t=1&p=1
    2019/8/31 15:19   laoalo
    '''
    
    import requests
    from lxml import etree
    
    # HTTP headers attached to every request made by this script: a desktop
    # browser User-Agent string and the target host name.
    head = dict([
        (
            'User-Agent',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
        ),
        ('Host', 'www.jjwxc.net'),
    ])
    
    def get_page_detail(url):
        """Return the book links found on one search-result page.

        Args:
            url: Address of the search-result page to fetch.

        Returns:
            A list with the ``href`` value of every link that sits directly
            inside an ``<h3 class="title">`` element. The list is empty when
            the request fails or the page contains no such links.
        """
        try:
            response = requests.get(url=url, headers=head, timeout=50)
        except requests.RequestException as e:
            # A timeout or refused connection on one page is reported and
            # treated as "no books" instead of aborting the caller's crawl.
            print(e, '页面请求失败')
            return []
        document = etree.HTML(response.text)
        return document.xpath('//h3[@class="title"]/a/@href')
    
    def get_book_detial(book_url):
        """Fetch one book page and extract its title, author and introduction.

        Args:
            book_url: Address of the book's page.

        Returns:
            A dict with the keys ``'title'``, ``'author'`` and
            ``'information'``, or ``None`` when the request fails or the
            title/author element cannot be found on the page.
        """
        try:
            response = requests.get(url=book_url, headers=head, timeout=50)
        except requests.RequestException as e:
            # Same outcome as a page with missing fields: report and give up
            # on this book only.
            print(e, '页面请求失败')
            return None

        # The page text is GBK. Declaring that before reading ``.text`` decodes
        # it once, instead of re-encoding every extracted string as ISO-8859-1
        # and decoding it again -- a round trip that raises when requests picked
        # any other encoding for the response.
        response.encoding = 'gbk'
        book_detail = etree.HTML(response.text)

        try:
            # Indexing [0] raises IndexError when the element is absent.
            title = book_detail.xpath("//span[@itemprop='articleSection']//text()")[0]
            author = book_detail.xpath("//span[@itemprop='author']//text()")[0]
        except IndexError as e:
            print(e, '下标越界')
            return None

        # string(...) evaluates to '' when the node is missing, so it cannot
        # raise IndexError and stays outside the try block.
        information = book_detail.xpath("string(//div[@id='novelintro'])")

        # str() turns lxml's string-result objects into plain strings.
        return {
            'title': str(title),
            'author': str(author),
            'information': str(information),
        }
    
        # targets = book_detail.xpath("//text()")
        # for index,target in enumerate(targets):
        #     print(index,'*'*30,target.encode('ISO-8859-1').decode('gbk'))
        # '''
        # 标签爬不出来
        # '''
    
    
    def spider(pages=range(1, 5)):
        """Crawl search-result pages and collect the details of each listed book.

        Args:
            pages: Iterable of result-page numbers to fetch. Defaults to pages
                1 through 4.

        Returns:
            A list of the dicts produced by ``get_book_detial``, in crawl
            order. Books for which it returned ``None`` are left out.
        """
        bookshelf = []
        for page_num in pages:
            # The three trailing newlines separate the output of each page.
            print("这是第{index}页的信息\n\n\n".format(index=page_num))
            url = 'http://www.jjwxc.net/search.php?kw=%C4%EA%CF%C2&t=1&p={page_num}'.format(page_num=page_num)
            for book_url in get_page_detail(url):
                book = get_book_detial(book_url)
                # get_book_detial signals a failed extraction with None; keep
                # those out of the result so callers only see real records.
                if book is not None:
                    bookshelf.append(book)
        return bookshelf
            # print(url)
    if __name__ == '__main__':
        # Script entry point: run the full crawl and print what was collected.
        # For a single-book check, call get_book_detial with one book URL,
        # e.g. "http://www.jjwxc.net/onebook.php?novelid=3402626".
        collected = spider()
        print(collected)

    运行时经常出现 "list index out of range 下标越界",或是 "TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败"。有的师傅说这是因为访问过快,导致 list 没有被成功赋值……代码有待优化。

  • 相关阅读:
    C#中使用事务
    C#中执行数据库存储过程
    构建ASP.net的AJAX开发环境
    C#开发数据库技巧汇总
    索引的作用及其使用
    C#中的多态性
    C#中调用C++的DLL
    不借助其它变量交换两变量值
    ASP.NET页面间传值的9种方式
    TERSUS无代码开发(笔记11)TERSUS框架学习框架基本信息修改
  • 原文地址:https://www.cnblogs.com/chrysanthemum/p/11449324.html
Copyright © 2011-2022 走看看