  • Scraping my own blog posts (practice; to be expanded)

    import requests
    from bs4 import BeautifulSoup

    url = 'https://www.cnblogs.com/wjlv/default.html?page=2'  # blog list page to open
    html_index = requests.get(url).text  # fetch the page source
    soap = BeautifulSoup(html_index, "lxml")
    a_list = soap.find_all('a', {"class": "postTitle2"})  # all post-title link tags
    for h in a_list:
        soap = BeautifulSoup(str(h), 'lxml')
        # print('{}:{}'.format(soap.find('a').text, soap.find('a')['href']))  # title and URL of one post
        article = requests.get(soap.find('a')['href']).text  # fetch one post's page
        soap_a = BeautifulSoup(article, 'lxml')
        p_lab = soap_a.find_all('p')  # the <p> tags holding the post body
        for txt in p_lab:
            soap_t = BeautifulSoup(str(txt), 'lxml')
            print(soap_t.find('p').text)  # print the text of each tag
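    One thing worth noting about the script above: the repeated BeautifulSoup(str(...)) re-parsing is unnecessary, since find_all already returns Tag objects whose .text and attribute lookups work directly. A trimmed sketch of the same loop, using the same URL and selectors as above:

    import requests
    from bs4 import BeautifulSoup

    url = 'https://www.cnblogs.com/wjlv/default.html?page=2'
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    for a in soup.find_all('a', {"class": "postTitle2"}):
        # a is already a Tag: a.text and a['href'] work without re-parsing
        article = BeautifulSoup(requests.get(a['href']).text, 'lxml')
        for p in article.find_all('p'):
            print(p.text)  # each p is also a Tag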
    Next, practicing on a novel from the Shuqi (书旗) novel site:
    There are still quite a few things to optimize:
    1. Too many repeated content-extraction steps
    2. Extraction is slow and single-threaded; multithreading would help (see the sketch after the code below)
    3. The novel to fetch is single and hard-coded
    import requests, os
    from bs4 import BeautifulSoup

    url = 'http://book.txtbook.com.cn/shu/6478/chapterlist.html'
    response = requests.get(url).text  # request the novel's chapter-list page
    soap = BeautifulSoup(response, 'lxml')
    title_list = soap.find('div', {"class": "t_list6"})
    soap_t = BeautifulSoup(str(title_list), 'lxml')
    href_list = soap_t.find_all('a')
    for href in href_list:
        soap_h = BeautifulSoup(str(href), 'lxml')
        content_url, content_title = soap_h.find('a', {'class': 'nocur'})['href'], soap_h.find('a').text  # chapter URL and title
        content_response = requests.get(content_url).text
        soap_r = BeautifulSoup(content_response, 'lxml')
        content_p = soap_r.find_all('div', {'id': 'chaptercontent'})  # pin down the chapter body by id
        for content in content_p:
            soap_c = BeautifulSoup(str(content), 'lxml')
            # print(soap_c.find('p'))
            if not os.path.exists(r'../books/'):  # create the output directory
                os.makedirs(r'../books/')
            with open(r'../books/' + content_title + '.txt', 'w+', encoding='utf8') as f:
                f.writelines(soap_c.find('p').text.split("<br/><br/>"))  # write the chapter text
            print('%s has been downloaded' % content_title)
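    On point 2, a minimal multithreaded sketch using concurrent.futures, assuming the same chapter-list structure as the script above (fetch_chapter is a hypothetical helper, not from the original script):

    import requests
    from bs4 import BeautifulSoup
    from concurrent.futures import ThreadPoolExecutor

    def fetch_chapter(link):  # hypothetical helper: download one chapter's text
        title, url = link.text, link['href']
        soup = BeautifulSoup(requests.get(url).text, 'lxml')
        body = soup.find('div', {'id': 'chaptercontent'})
        return title, body.text if body else ''

    index_url = 'http://book.txtbook.com.cn/shu/6478/chapterlist.html'
    index = BeautifulSoup(requests.get(index_url).text, 'lxml')
    links = index.find('div', {"class": "t_list6"}).find_all('a')
    with ThreadPoolExecutor(max_workers=8) as pool:  # fetch chapters concurrently
        for title, text in pool.map(fetch_chapter, links):
            print('%s has been downloaded' % title)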

    Scraping a personal blog:

    import re, os, requests


    def get_url(url, pattern=None):
        try:
            response = requests.get(url)
            section_info = []
            if response.status_code == 200:
                if pattern is not None:
                    p_section = re.compile(pattern, re.S)  # e.g. r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>'
                    section_info = re.findall(p_section, response.text)
                return section_info
            else:
                return None
        except Exception as e:
            return e


    def write_content(content, section):
        if not os.path.exists(r'../blogs/'):
            os.makedirs(r'../blogs/')
        try:
            with open(r'../blogs/' + section.strip().replace('.', '_').replace('——', '_').replace(' ', '') + '.txt', 'w+') as f:
                f.write('\n'.join(content))  # one extracted paragraph per line
                print('%s downloaded' % section)
        except IOError:
            print('%s failed to download' % section)


    def get_content(section_list, pattern):
        for section in section_list:
            content_list = get_url(section[0], pattern)
            if not isinstance(content_list, list):  # skip pages that failed to load
                continue
            content = []
            for p in content_list:
                p = p.strip()
                content.append(p)
            write_content(content, section[1])


    if __name__ == '__main__':
        url = 'https://www.cnblogs.com/wjlv/'
        section_list = get_url(url, r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>')
        get_content(section_list, r'<p>(.*?)</p>')
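    The main block above only pulls the first list page, while cnblogs paginates via ?page=N (as the first script's URL shows). A sketch that walks the pages with the same helpers, assuming an empty match list signals the last page:

    if __name__ == '__main__':
        page = 1
        while True:
            url = 'https://www.cnblogs.com/wjlv/default.html?page=%d' % page
            section_list = get_url(url, r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>')
            if not isinstance(section_list, list) or not section_list:
                break  # request failed or no more posts: stop
            get_content(section_list, r'<p>(.*?)</p>')
            page += 1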