zoukankan html css js c++ java

Python爬虫：爬取糗事百科

在网上看到一个教程，但它是用正则表达式写的，而且无法运行；于是我改用 XPath 重写了解析逻辑，并使用了多线程（4 个线程的线程池），也算是原创了吧。
#!/usr/bin/python
# -*- encoding:utf-8 -*-


from lxml import etree
from multiprocessing.dummy import Pool as ThreadPool
import requests
import sys
# Encoding set-up (Python 2 only): make UTF-8 the interpreter-wide default
# codec so the implicit str/unicode conversions further down do not raise
# UnicodeError on Chinese text.  reload(sys) is needed because
# sys.setdefaultencoding is removed from the sys namespace at start-up.
# On Python 3 neither name exists and the default is already UTF-8, so the
# whole step is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

#定义输出函数
def towrite(contentdict):
    """Append one scraped joke to the module-level output file ``f``.

    ``contentdict`` must provide text values under the keys ``'author'``,
    ``'content'``, ``'vote'`` and ``'span'`` (the comment count).  The
    record is written as four labelled lines followed by one blank
    separator line.

    Raises KeyError if any of the four keys is missing.
    """
    record = (
        u'作者:' + contentdict['author'] + '\n'
        + u'内容:' + contentdict['content'] + '\n'
        + u'好笑:' + contentdict['vote'] + '\n'
        + u'评论:' + contentdict['span'] + '\n\n'
    )
    # The file handle is shared by every worker thread of the pool, so the
    # record is emitted with a single write() call instead of four; this
    # keeps the lines of one record together far more reliably than
    # separate calls would.
    f.write(record)

def spider(url):
    """Download one listing page and hand every joke on it to ``towrite``.

    For each article node found under the ``content-left`` container the
    author, body text, vote count and comment count are extracted and
    passed to ``towrite`` as a dict with the keys ``'author'``,
    ``'content'``, ``'vote'`` and ``'span'``.

    A missing author node yields an empty author, and a missing vote or
    comment counter yields ``'0'``, so one incomplete article does not
    abort the rest of the page.

    Network errors raised by ``requests`` (including a timeout) propagate
    to the caller.  Returns None.
    """
    # Fetch the page source, presenting a browser-like User-Agent.
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    # Without a timeout a stalled connection would block this worker
    # thread (and therefore pool.map) indefinitely.
    html = requests.get(url, headers=headers, timeout=30)

    # Locate the article nodes.
    selector = etree.HTML(html.text)
    content_field = selector.xpath(
        '//*[@id="content-left"]/div[@class="article block untagged mb15"]')

    for article in content_field:
        # Author: flatten the whole author box to text, then drop the
        # newlines and spaces that come from the page markup.
        author_nodes = article.xpath('div[@class="author clearfix"]')
        if author_nodes:
            author = author_nodes[0].xpath('string(.)')
            author = author.replace('\n', '').replace(' ', '')
        else:
            author = ''

        # Content: the body may be split into several text nodes; strip
        # the layout whitespace from each and join them.
        pieces = article.xpath('div[@class="content"]/text()')
        content = ''.join(
            piece.replace('\n', '').replace(' ', '').replace('\t', '')
            for piece in pieces)

        # Votes ("funny" count).
        vote_nodes = article.xpath(
            'div[@class="stats"]/span[@class="stats-vote"]/i/text()')
        vote = vote_nodes[0] if vote_nodes else '0'

        # Comments: the <i> node is absent when there are no comments.
        span_nodes = article.xpath(
            'div[@class="stats"]/span[@class="stats-comments"]/a/i/text()')
        span = span_nodes[0] if span_nodes else '0'

        # A fresh dict per article, so no state is carried between records.
        towrite({
            'author': author,
            'content': content,
            'vote': vote,
            'span': span,
        })


if __name__ == '__main__':
    # Listing pages 1..35 of the "hot" section.
    url = ['http://www.qiushibaike.com/hot/page/' + str(i)
           for i in range(1, 36)]

    # towrite() looks this handle up as the module-level name ``f``, so it
    # must keep that name.  Output is appended to any existing file.
    f = open('content.txt', 'a')
    try:
        pool = ThreadPool(4)
        try:
            # Blocks until every page is processed; spider() returns None,
            # the useful output is what the workers wrote to ``f``.
            results = pool.map(spider, url)
        finally:
            # Shut the workers down even if a page failed.
            pool.close()
            pool.join()
    finally:
        # Close only after all workers have stopped writing.
        f.close()

查看全文

相关阅读:
五、异步任务编排CompletableFuture
四、fork/join框架
 三、阻塞等待异步结果FutureTask
二、synchronized同步锁
 一、Java多线程基础
 6. ZigZag Conversion （字符串的连接）
5. Longest Palindromic Substring （DP）
4. Median of Two Sorted Arrays (二分法；递归的结束条件）
3. Longest Substring Without Repeating Characters （ASCII码128个，建立哈西表）
2. Add Two Numbers

原文地址：https://www.cnblogs.com/miranda-tang/p/5508368.html