zoukankan      html  css  js  c++  java
  • 爬取个人随笔内容——练手,待补充

    import requests,lxml
    from bs4 import BeautifulSoup

    # Crawl one page of the blog index, follow every post link found on it and
    # print the text of each <p> element of the linked post.
    url='https://www.cnblogs.com/wjlv/default.html?page=2'  # index page to crawl
    html_index = requests.get(url, timeout=10).text  # raw HTML of the index page
    soup = BeautifulSoup(html_index, "lxml")
    # Anchors carrying the "postTitle2" class hold each post's title and address.
    a_list = soup.find_all('a', {"class": "postTitle2"})
    for h in a_list:
        # A Tag can be queried directly; re-parsing str(tag) is unnecessary.
        href = h.get('href')
        if not href:
            # Skip anchors without a target instead of raising KeyError.
            continue
        article = requests.get(href, timeout=10).text  # HTML of a single post
        soup_a = BeautifulSoup(article, 'lxml')
        for txt in soup_a.find_all('p'):
            print(txt.text)  # text content of each paragraph tag
    拿书旗小说网站的一篇小说练练手:
    还是有很多地方需要优化:
    1、重复提取内容的步骤较多
    2、提取效率较为低下,可考虑使用多线程提速
    3、小说获取较单一、固定
     1 import lxml, requests, os
     2 from bs4 import BeautifulSoup
     3 
     4 url = 'http://book.txtbook.com.cn/shu/6478/chapterlist.html'
     5 response = requests.get(url).text  # 请求小说章节总览页
     6 soap = BeautifulSoup(response, 'lxml')
     7 title_list = soap.find('div', {"class": "t_list6"})
     8 soap_t = BeautifulSoup(str(title_list), 'lxml')
     9 href_list = soap_t.find_all('a')
    10 for href in href_list:
    11     soap_h = BeautifulSoup(str(href), 'lxml')
    12     content_url, content_title = soap_h.find('a', {'class': 'nocur'})['href'], soap_h.find('a').text  # 获取章节名称和url
    13     content_response = requests.get(content_url).text
    14     soap_r = BeautifulSoup(content_response, 'lxml')
    15     content_p = soap_r.find_all('div', {'id': 'chaptercontent'})  # 多属性确定内容位置
    16     for content in content_p:
    17         soap_c = BeautifulSoup(str(content), 'lxml')
    18         # print(soap_c.find('p'))
    19         if not os.path.exists(r"E:python_project1dayooks"):  # 创建存放目录
    20             os.makedirs(r'../books/')
    21         with open(r'../books/' + content_title + '.txt', 'w+',encoding='utf8') as f:
    22             f.writelines(soap_c.find('p').text.split("<br/><br/>"))  # 写入章节内容
    23         print('%s has already download' %content_title)
    View Code

    爬取个人博客:

     1 import re, os, requests
     2 
     3 
     4 def get_url(url, pattern=None):
     5     try:
     6         response = requests.get(url)
     7         section_info = []
     8         if response.status_code == 200:
     9             if pattern is not None:
    10                 p_section = re.compile(pattern, re.S)  # r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>'
    11                 section_info = re.findall(p_section, response.text)
    12             return section_info
    13         else:
    14             return None
    15     except Exception as e:
    16         return e
    17 
    def write_content(content,section):
        """Save *content* to ``../blogs/<sanitised section>.txt`` as UTF-8 text.

        Args:
            content: A list/tuple of text fragments, written one per line; any
                other value is written as ``str(content)`` with surrounding
                square brackets stripped.
            section: Title used for the file name and the progress message.

        A success or failure message is printed; I/O errors while writing are
        reported, not raised.
        """
        out_dir = r'../blogs/'
        os.makedirs(out_dir, exist_ok=True)  # no check-then-create race

        name = section.strip().replace('.','_').replace('——','_').replace(' ','')
        # Characters that are illegal in file names on common filesystems would
        # make open() fail for every such title; map them to underscores.
        name = re.sub(r'[\\/:*?"<>|]', '_', name)

        if isinstance(content, (list, tuple)):
            # Write the fragments themselves, not the Python repr of the list.
            text = '\n'.join(str(part) for part in content)
        else:
            text = str(content).strip('][')

        try:
            # Explicit encoding: the platform default may not encode the text.
            with open(out_dir + name + '.txt', 'w+', encoding='utf8') as f:
                f.write(text)
            print('%s 下载完成……' %section)
        except IOError:
            print('%s 下载失败……' %section)
    27 
    def get_content(section_list, pattern):
        """Download and store the body of every section in *section_list*.

        Args:
            section_list: Sequence of ``(url, title)`` pairs. Anything that is
                not a list or tuple (for example a failed lookup result) is
                ignored.
            pattern: Regular expression handed to ``get_url`` to extract the
                text fragments of each page.

        Each page's fragments are stripped of surrounding whitespace and passed
        to ``write_content`` together with the section title.
        """
        if not isinstance(section_list, (list, tuple)):
            return  # nothing usable to iterate over
        for section in section_list:
            content_list = get_url(section[0], pattern)
            if not isinstance(content_list, (list, tuple)):
                # The fetch produced no iterable result; report and move on.
                print('%s 下载失败……' % section[1])
                continue
            content = [p.strip() for p in content_list]
            write_content(content, section[1])
    36 
    37 
    if __name__ == '__main__':
        # Collect (url, title) pairs from the blog front page, then download
        # the <p> content of every post found.
        url = 'https://www.cnblogs.com/wjlv/'
        section_list = get_url(url, r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>')
        if isinstance(section_list, list) and section_list:
            get_content(section_list, r'<p>(.*?)</p>')
        else:
            # The lookup failed or matched nothing; there is nothing to download.
            print('%s 未获取到随笔列表……' % url)
    View Code
  • 相关阅读:
    how to pass a Javabean to server In Model2 architecture.
    What is the Web Appliation Archive, abbreviation is "WAR"
    Understaning Javascript OO
    Genetic Fraud
    poj 3211 Washing Clothes
    poj 2385 Apple Catching
    Magic Star
    关于memset的用法几点
    c++ 函数
    zoj 2972 Hurdles of 110m
  • 原文地址:https://www.cnblogs.com/wjlv/p/11413542.html
Copyright © 2011-2022 走看看