zoukankan      html  css  js  c++  java
  • Python爬虫:爬取糗事百科

    我在网上看到的一篇教程是用正则表达式写的,但已经无法运行。于是我改用 XPath 重写了解析逻辑,并用线程池(4 个线程)并发抓取,也算是原创了吧。
    #!/usr/bin/python
    # -*- encoding:utf-8 -*-


    from lxml import etree
    from multiprocessing.dummy import Pool as ThreadPool
    import requests
    import sys
    # Encoding setup (Python 2 only): make UTF-8 the interpreter's implicit
    # str <-> unicode codec. reload(sys) is needed because setdefaultencoding
    # is removed from the sys namespace during interpreter start-up.
    # On Python 3 `reload` is not a builtin and the default encoding is already
    # UTF-8, so the whole step is skipped there instead of raising NameError.
    if sys.version_info[0] < 3:
        reload(sys)
        sys.setdefaultencoding('utf-8')

    #定义输出函数
    def towrite(contentdict):
        """Append one scraped post to the module-level output file ``f``.

        Args:
            contentdict: Mapping with the text values stored under the keys
                ``'author'``, ``'content'``, ``'vote'`` and ``'span'``.

        Raises:
            KeyError: If any of the four keys is missing.
        """
        # NOTE(review): fields are separated by a single space and nothing
        # terminates the record, so successive records share one line.
        # Confirm whether a newline was intended here.
        record = (
            u'作者:' + contentdict['author'] + ' '
            + u'内容:' + contentdict['content'] + ' '
            + u'好笑:' + contentdict['vote'] + ' '
            + u'评论:' + contentdict['span'] + ' '
        )
        # One write() call per record instead of four writelines() calls on
        # bare strings: writelines() expects an iterable of lines, and emitting
        # the record in one piece narrows the window in which records coming
        # from different worker threads can interleave.
        f.write(record)

    def spider(url):
        """Download one listing page and hand every post on it to ``towrite``.

        Args:
            url: Address of the page to fetch.

        Returns:
            None. Each post becomes a dict with the keys ``'author'``,
            ``'content'``, ``'vote'`` and ``'span'`` that is passed to
            ``towrite``. A page that cannot be downloaded is reported on
            stderr and skipped.
        """
        # Fetch the page source.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        try:
            # Without a timeout a stalled connection would block this worker
            # thread indefinitely.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            # Skip this page rather than let the exception abort pool.map()
            # for every other page.
            sys.stderr.write('skipping %s: %s\n' % (url, exc))
            return

        # Locate the post containers.
        selector = etree.HTML(response.text)
        posts = selector.xpath('//*[@id="content-left"]/div[@class="article block untagged mb15"]')

        for post in posts:
            # Author: full text of the author box with spaces removed.
            # NOTE(review): the original chained several identical
            # replace(' ', '') calls here and for the content below, which
            # looks like newline/tab escapes lost in transcription. Confirm
            # whether '\n' and '\t' should be stripped as well.
            author_nodes = post.xpath('div[@class="author clearfix"]')
            if author_nodes:
                author = author_nodes[0].xpath('string(.)').replace(' ', '')
            else:
                author = ''

            # Content: all direct text nodes of the content div, spaces removed.
            content_parts = post.xpath('div[@class="content"]/text()')
            content = ''.join(part.replace(' ', '') for part in content_parts)

            # Funny votes; fall back to '0' when the counter node is missing.
            vote_nodes = post.xpath('div[@class="stats"]/span[@class="stats-vote"]/i/text()')
            vote = vote_nodes[0] if vote_nodes else '0'

            # Comment count; the <i> node is absent when there are no comments.
            comment_nodes = post.xpath('div[@class="stats"]/span[@class="stats-comments"]/a/i/text()')
            span = comment_nodes[0] if comment_nodes else '0'

            # A fresh dict per post, so no state is shared between iterations.
            item = {
                'author': author,
                'content': content,
                'vote': vote,
                'span': span,
            }
            towrite(item)


    if __name__ == '__main__':
        # Script entry point: scrape listing pages 1-35 with a pool of four
        # worker threads and append the results to content.txt.

        # Local import: io.open takes an explicit encoding on both Python 2
        # and Python 3, so the output no longer depends on the platform or
        # interpreter default encoding.
        import io

        url = ['http://www.qiushibaike.com/hot/page/' + str(i) for i in range(1, 36)]

        # `f` has to remain a module-level name because towrite() writes to it.
        # The file is opened before the pool is created so that a failing
        # open() cannot leave idle worker threads behind.
        with io.open('content.txt', 'a', encoding='utf-8') as f:
            pool = ThreadPool(4)
            try:
                pool.map(spider, url)
            finally:
                # Always shut the workers down, even if a page raised; the file
                # is closed only after every worker has been joined.
                pool.close()
                pool.join()

  • 相关阅读:
    五、异步任务编排CompletableFuture
    四、fork/join框架
    三、阻塞等待异步结果FutureTask
    二、synchronized同步锁
    一、Java多线程基础
    6. ZigZag Conversion (字符串的连接)
    5. Longest Palindromic Substring (DP)
    4. Median of Two Sorted Arrays (二分法;递归的结束条件)
    3. Longest Substring Without Repeating Characters (ASCII码128个,建立哈西表)
    2. Add Two Numbers
  • 原文地址:https://www.cnblogs.com/miranda-tang/p/5508368.html
Copyright © 2011-2022 走看看