zoukankan html css js c++ java

python爬虫：爬取读者某一期内容

学会了怎么使用os模块

#!/usr/bin/python
# -*- encoding:utf-8 -*-

import requests
import os
from bs4 import BeautifulSoup

def urlBS(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree.

    The response is always decoded as UTF-8 and parsed with the ``lxml``
    parser.
    """
    page = requests.get(url)
    # The encoding requests detects on its own was wrong for these pages,
    # so override it before reading the decoded text.
    page.encoding = 'utf-8'
    # BeautifulSoup is handed the decoded text (page.text); passing the
    # response object itself raised an error.
    return BeautifulSoup(page.text, 'lxml')

def get_article(url):
    """Save every article linked from the index page at *url* as a text file.

    Each link matched by the ``.booklist a`` selector on the index page is
    fetched via ``urlBS``. For every article a UTF-8 file named
    ``<title>.txt`` is written into a folder under the current working
    directory, containing a ``<<title>>`` line, the stripped text of the
    element with id ``pub_date``, and then the text of every paragraph
    matched by ``.blkContainerSblkCon p``.

    Relies on the module-level names ``baseurl`` and ``time``, which are not
    defined in the visible part of the file.
    NOTE(review): ``time`` is concatenated into the folder name, so it is
    presumably a string identifying the issue -- confirm against its
    definition.
    """
    # Local import so the block does not depend on a change to the file's
    # import section.
    import io

    # Collect the article links from the index page.
    booklist_soup = urlBS(url)
    href_list = booklist_soup.select('.booklist a')

    # Output folder lives under the current working directory; create it on
    # first use.
    path = os.getcwd()+u'/读者文章保存'+time+u'/'
    if not os.path.isdir(path):
        os.mkdir(path)

    for href in href_list:
        newurl = baseurl + href['href']
        result = urlBS(newurl)

        title = result.find("h1").string  # article title
        filename = path + title + '.txt'
        # Text of the 'pub_date' element, with surrounding whitespace removed.
        author = result.find(id='pub_date').string.strip()
        # Parenthesised form prints the same on Python 2 and parses on Python 3.
        print(filename + '   ' + author)

        # Text-mode file with an explicit encoding: the text is encoded once
        # at the I/O boundary, and the handle is closed even if a write fails.
        with io.open(filename, "w", encoding="utf-8") as new:
            new.write(u"<<" + title + u">>\n\n")
            new.write(author + u"\n\n")
            for p in result.select(".blkContainerSblkCon p"):
                # Use .text rather than .string.strip(): stripping removed
                # the line breaks and collapsed the article onto one line.
                new.write(p.text)

查看全文

相关阅读:
java中的HMAC-SHA1加密
 java拦截处理System.exit(0)
使用canal分析binlog(二) canal源码分析
 JS的异步世界
 socket.io的用户认证
 一个补零小函数
 使用gulp在开发过程中合理导出zip文件
 使用r.js进行前端requirejs的合并压缩
 使用r2d3的注意事项
 三列自适应布局的实现方式(兼容IE6+)

原文地址：https://www.cnblogs.com/miranda-tang/p/5508417.html