  • Crawling dynamic web pages with Python

    Example: scraping the novel Sheng Xu (圣墟) from the Biquge (笔趣阁) site

    1. Scrape the URLs of the novel's chapters

    from bs4 import BeautifulSoup
    from selenium import webdriver
    import re

    def book_url():
        chrome_options = webdriver.ChromeOptions()
        # Set a proxy. Note: there must be no spaces around the "=".
        chrome_options.add_argument("--proxy-server=http://202.20.16.82:10152")
        driver = webdriver.Chrome(options=chrome_options)
        driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds
        driver.get(r'http://www.xbiquge.la/13/13959/')
        txt = driver.page_source
        soup = BeautifulSoup(txt, 'html.parser')
        chapter_list = str(soup.find_all('div', id='list'))
        url = re.findall('<a href="(.*)">', chapter_list)
        word = re.findall('<a.*>(.*)</a>', chapter_list)
        word_dict = dict(zip(word, url))  # chapter title -> relative URL
        driver.quit()
        return word_dict
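
    As an aside, the same chapter map can be built with BeautifulSoup's selector API instead of running regexes over str(...). A minimal sketch, reusing the imports above and assuming the chapter links still sit inside <div id="list"> (the helper name book_url_css is not from the original post):

    def book_url_css(page_source):
        # Build a {chapter title: relative URL} dict from the list page
        soup = BeautifulSoup(page_source, 'html.parser')
        word_dict = {}
        for a in soup.select('div#list a'):
            word_dict[a.get_text()] = a['href']
        return word_dict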

    2. Scrape the first 200 chapters and write them to a txt file

    from bs4 import BeautifulSoup
    from selenium import webdriver
    import re
    import codecs
    import crawling.pro_2.py1 as py1  # the module from step 1


    def url():
        # Flatten the {title: URL} dict from step 1 into a list of URLs
        word_dict = py1.book_url()
        word = []
        for i in word_dict.values():
            word.append(i)
        return word


    def book(url):
        chrome_options = webdriver.ChromeOptions()
        # Set a proxy. Note: there must be no spaces around the "=",
        # i.e. not --proxy-server = http://202.20.16.82:10152
        chrome_options.add_argument("--proxy-server=http://202.20.16.82:10152")
        driver = webdriver.Chrome(options=chrome_options)
        driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds
        driver.get('http://www.xbiquge.la/' + url)
        txt = driver.page_source
        soup = BeautifulSoup(txt, 'html.parser')
        a = str(soup.find_all('div', id='content'))
        # Strip the surrounding markup so only the chapter text is left
        a = re.sub(r'<div id="content">', '', a)
        a = re.sub(r'</p></div>', '', a)
        a = re.sub('\xa0', '', a)  # non-breaking spaces (&nbsp;)
        a = re.sub(r'<p><a href=', '', a)
        a = re.sub(r'target="_blank">', '', a)
        a = re.sub(r'</a>', '', a)
        line = a.split("<br/>")
        name = re.findall('<h1>(.*)</h1>', str(soup.find_all('div', class_='bookname')))
        name = re.sub("'", '', str(name))
        f = codecs.open('小说圣墟.txt', 'a', 'utf-8')
        # Remove the entries that consist of a single space
        kong_list = []
        for j in line:
            if j == ' ':
                kong_list.append(j)
        for k in kong_list:
            line.remove(k)
        print(name, end=' ', file=f)
        for i in line:
            text = re.sub(r' ', '', i)
            print(text, file=f)
        f.close()
        driver.quit()


    if __name__ == '__main__':
        url_list = url()
        del url_list[200:]  # keep only the first 200 chapters
        for chapter_url in url_list:
            book(chapter_url)
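
    The chain of re.sub calls in book() is brittle: it mirrors the exact markup of the chapter page, so any template change breaks it. A minimal alternative sketch using Tag.get_text(), which drops all tags in one step (the helper name chapter_lines is hypothetical and reuses the imports above):

    def chapter_lines(soup):
        # Pull the chapter body out of <div id="content"> without regex surgery
        content = soup.find('div', id='content')
        if content is None:
            return []
        # get_text('\n') inserts a newline at each tag boundary, so the
        # <br/> paragraph breaks survive; then drop &nbsp; and blank lines
        text = content.get_text('\n').replace('\xa0', '')
        return [ln.strip() for ln in text.splitlines() if ln.strip()]

    Inside book(), line = chapter_lines(soup) could then replace everything from the first re.sub down to the split on "<br/>".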
     
  • Original post: https://www.cnblogs.com/lihui123/p/12806955.html