zoukankan      html  css  js  c++  java
  • 爬取个人随笔内容——练手,待补充

    import lxml  # not referenced by name; presumably imported so a missing "lxml" parser backend fails early
    import requests
    from bs4 import BeautifulSoup

    # Crawl one index page of the blog and print the text of every <p> in each post.
    url = 'https://www.cnblogs.com/wjlv/default.html?page=2'  # index page to crawl
    html_index = requests.get(url, timeout=10).text  # timeout so a stalled server cannot hang the script
    soap = BeautifulSoup(html_index, "lxml")
    # Post titles on the index page are <a class="postTitle2"> tags; their href is the post URL.
    a_list = soap.find_all('a', {"class": "postTitle2"})
    for h in a_list:
        # A Tag already exposes its attributes, so there is no need to re-parse str(h).
        article = requests.get(h['href'], timeout=10).text  # body of a single post
        soap_a = BeautifulSoup(article, 'lxml')
        for txt in soap_a.find_all('p'):
            print(txt.text)  # text content of each paragraph tag
     import lxml  # not referenced by name; presumably imported so a missing "lxml" parser backend fails early
     import os
     import requests
     from bs4 import BeautifulSoup

     # Download every chapter linked from a novel's chapter-list page into ../books/<title>.txt.
     url = 'http://book.txtbook.com.cn/shu/6478/chapterlist.html'
     book_dir = r'../books/'
     # Characters that cannot appear in a file name on common file systems are mapped to '_'.
     unsafe_chars = str.maketrans({c: '_' for c in '\\/:*?"<>|'})

     response = requests.get(url, timeout=10).text  # chapter overview page
     soap = BeautifulSoup(response, 'lxml')
     title_list = soap.find('div', {"class": "t_list6"})
     # Only anchors carrying class "nocur" are chapter links; selecting them here avoids
     # subscripting None for any other anchor inside the list container.
     href_list = title_list.find_all('a', {'class': 'nocur'}) if title_list is not None else []
     for href in href_list:
         content_url, content_title = href.get('href'), href.text  # chapter URL and chapter name
         if not content_url:
             continue
         content_response = requests.get(content_url, timeout=10).text
         soap_r = BeautifulSoup(content_response, 'lxml')
         for content in soap_r.find_all('div', {'id': 'chaptercontent'}):
             paragraph = content.find('p')
             if paragraph is None:
                 continue  # container without a <p>: nothing to save
             # Create the directory that is actually written to; exist_ok makes this safe
             # to repeat for every chapter.
             os.makedirs(book_dir, exist_ok=True)
             file_name = content_title.translate(unsafe_chars) + '.txt'
             with open(book_dir + file_name, 'w', encoding='utf8') as f:
                 # .text already drops the <br/> tags, so the text is written as one string.
                 f.write(paragraph.text)
             print('%s has already download' %content_title)
    View Code


     1 import re, os, requests
     def get_url(url, pattern=None):
         """Fetch *url* and return all matches of *pattern* in the response body.

         Args:
             url: Address to request with HTTP GET.
             pattern: Regular expression source string, compiled with ``re.S`` so
                 that ``.`` also matches newlines. When ``None`` nothing is
                 searched.

         Returns:
             The list produced by ``re.findall`` (strings, or tuples when the
             pattern has several groups). An empty list is returned when
             *pattern* is ``None``, when the request fails, or when the status
             code is not 200, so callers can always iterate the result.

         Raises:
             re.error: If *pattern* is not a valid regular expression.
         """
         try:
             response = requests.get(url, timeout=10)
         except requests.RequestException:
             # Report the failure instead of handing the exception object back
             # as if it were a result.
             print('%s 下载失败……' % url)
             return []
         if response.status_code != 200:
             print('%s 下载失败……' % url)
             return []
         if pattern is None:
             return []
         p_section = re.compile(pattern, re.S)  # e.g. r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>'
         return p_section.findall(response.text)
    def write_content(content,section):
        """Save *content* to ``../blogs/<sanitized section>.txt``.

        Args:
            content: Object to save. It is written as ``str(content)`` with any
                leading/trailing ``[`` and ``]`` characters stripped.
            section: Title used for the file name. Surrounding whitespace is
                removed, ``.`` and ``——`` become ``_``, spaces are dropped and
                characters that are illegal in file names become ``_``.

        A success or failure message is printed; I/O errors while writing are
        reported rather than raised.
        """
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(r'../blogs/', exist_ok=True)
        file_name = section.strip().replace('.','_').replace('——','_').replace(' ','')
        # Titles may contain characters such as '/' or '?' that cannot be part of a file name.
        file_name = re.sub(r'[\\/:*?"<>|]', '_', file_name)
        try:
            # Explicit UTF-8 so non-ASCII text does not depend on the platform's default encoding.
            with open(r'../blogs/' + file_name + '.txt', 'w', encoding='utf8') as f:
                f.write(str(content).strip(']['))
        except IOError:
            print('%s 下载失败……' %section)
        else:
            print('%s 下载完成……' %section)
    def get_content(section_list, pattern):
        """Download every section and save the text matched by *pattern*.

        Args:
            section_list: Iterable of ``(url, title)`` pairs, as produced by
                ``get_url`` for the index page. ``None`` or an exception object
                (the values ``get_url`` may hand back on failure) are treated as
                "nothing to do".
            pattern: Regular expression passed to ``get_url`` to extract the
                pieces of each section page.
        """
        if section_list is None or isinstance(section_list, Exception):
            return
        for section in section_list:
            content_list = get_url(section[0], pattern)
            if not isinstance(content_list, list):
                # The fetch failed; skip this section instead of iterating a non-list.
                print('%s 下载失败……' % section[1])
                continue
            write_content([p.strip() for p in content_list], section[1])
    if __name__ == '__main__':
        # Entry point: collect (url, title) pairs from the blog index, then save each post.
        url = 'https://www.cnblogs.com/wjlv/'
        section_list = get_url(url, r'class="postTitle2[^.]*?ef="(.*?)">(.*?)</a>')
        if isinstance(section_list, list):
            get_content(section_list, r'<p>(.*?)</p>')
        else:
            # get_url did not return a list of matches, so there is nothing to download.
            print('%s 下载失败……' % url)
    View Code
  • 相关阅读:
    ios 数据类型转换 UIImage转换为NSData NSData转换为NSString
    iOS UI 12 block传值
    iOS UI 11 单例
    iOS UI 08 uitableview 自定义cell
    iOS UI 07 uitableviewi3
    iOS UI 07 uitableviewi2
    iOS UI 07 uitableview
    iOS UI 05 传值
    iOS UI 04 轨道和动画
    iOS UI 03 事件和手势
  • 原文地址:https://www.cnblogs.com/wjlv/p/11413542.html
Copyright © 2011-2022 走看看