  • Python crawler practice

    Using BeautifulSoup to strip specific tags (here, <br>) from downloaded pages. A standalone sketch of the technique follows the script below.

    # imports: GUI prompts, HTTP fetching, HTML parsing, and the local URL config
    import easygui as g
    import urllib.request
    from bs4 import BeautifulSoup
    import config.story2 as urls
    
    # Prompt for the index URL and the two fragments used to assemble chapter URLs
    def set_url():
        msg = "Please fill in the fields below (items marked * are required)"
        title = "Crawler practice"
        fieldNames = ["*Story index URL", "*URL prefix", "URL suffix"]
        fieldValues = g.multenterbox(msg, title, fieldNames)
        while True:
            if fieldValues is None:
                break
            errmsg = ""
            for i in range(len(fieldNames)):
                option = fieldNames[i].strip()
                if fieldValues[i].strip() == "" and option[0] == "*":
                    errmsg += ("[%s] is required   " % fieldNames[i])
            if errmsg == "":
                break
            fieldValues = g.multenterbox(errmsg, title, fieldNames, fieldValues)
    
        return fieldValues
    
    
    # Download the index page and collect each chapter title with its URL
    def get_urls(seed_url, pre_url, last_url):
        # map: chapter title -> chapter URL
        storyList = {}
        response = urllib.request.urlopen(seed_url)
        html = response.read().decode('utf-8')
        bs = BeautifulSoup(html, "html.parser")
        contents = bs.find_all("div", {"class": "c-line-bottom"})
        for each in contents:
            # get the article's data-nsrc attribute
            nsrc = each.a["data-nsrc"]
            # assemble the chapter URL from prefix + nsrc + suffix
            story_url = pre_url + nsrc + last_url
            # get the chapter title
            title = each.p.string
            storyList[title] = story_url
    
        return storyList
    
    # Fetch every story and save it to disk
    def getStory():
        # a raw string cannot end with a backslash (r"E:\stories\" is a syntax
        # error), so escape the trailing backslash instead
        savepath = "E:\\stories\\"
        storyList = get_urls(urls.url1,urls.url2,urls.url3)
        storyNames = list(storyList.keys())
        for i in range(len(storyNames)):
            # fetch the story page
            html = urllib.request.urlopen(storyList[storyNames[i]]).read().decode('utf-8')
            bs = BeautifulSoup(html,"html.parser")
    
            # bs('br') is shorthand for bs.find_all('br'); extract() detaches each
            # <br> tag from the tree (it turned out this works)
            [s.extract() for s in bs('br')]
            content = bs.find_all('p')
            # [ss.extract() for ss in content('p')]  # fails here: find_all returns a
            # ResultSet, which is not callable. TypeError: 'ResultSet' object is not callable
            # Removing the <br> markup by string replacement did not work either:
            # oldstr = r'<br style="font-size:16px;font-weight:normal;' \
            #          r'margin-left:4px;margin-right:4px;float:none;color:rgb(0, 0, 0);' \
            #          r'text-align:-webkit-auto;text-indent:0px;white-space:normal;' \
            #          r'text-overflow:clip;clear:none;display:inline;"/>'

            # print(content)
    
            # write as UTF-8 so Chinese text does not trip over the platform default encoding
            with open(savepath + storyNames[i] + ".txt", 'w', encoding='utf-8') as f:
                f.writelines(str(content))
    
    # download(get_url())
    # get_url()
    getStory()
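
    A minimal, self-contained sketch of the tag-stripping idea above; the HTML
    snippet is a made-up stand-in for a downloaded chapter page, not the blog's
    actual markup:

    from bs4 import BeautifulSoup

    # made-up HTML standing in for a fetched story page
    html = '<div><p>First line<br/>more text</p><p>Second line<br/></p></div>'
    soup = BeautifulSoup(html, "html.parser")

    # soup('br') is shorthand for soup.find_all('br'); extract() detaches each
    # tag from the tree (decompose() would discard it outright)
    for br in soup('br'):
        br.extract()

    # find_all returns a ResultSet, a plain list subclass, so calling it like
    # content('p') raises TypeError; iterate over it instead
    paragraphs = soup.find_all('p')
    text = "\n".join(p.get_text() for p in paragraphs)
    print(text)  # paragraph text with every <br> gone

    Joining p.get_text() also sidesteps the str(content) trick above, which writes
    raw <p>...</p> markup into the .txt files rather than plain text.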
  • Original article: https://www.cnblogs.com/scios/p/8834609.html