zoukankan      html  css  js  c++  java
  • 输入一个网址提取文字代码

    import requests
    from bs4 import BeautifulSoup
    import re
    
    # Target page whose text content will be extracted.
    url = 'http://imgtest.yijingjia.com/95f5aa57ab5a4a828a4aa4007587ef5a_1631190202380'
    print(url)
    # Send a desktop-Chrome User-Agent header with the request.
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    headers = {'User-Agent': user_agent}
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    res = requests.get(url=url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving the text of an error page.
    res.raise_for_status()
    # Decode the body with the encoding detected from its content.
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    # Use the text of the whole <html> element. If the parser produced no
    # <html> element, fall back to the document object itself so that
    # get_text() is never called on None.
    news = soup.find('html')
    if news is None:
        news = soup
    string = news.get_text()
    
    # 0. Map non-breaking (U+00A0) and ideographic (U+3000) spaces to plain
    #    spaces so the space-based patterns below can see them.
    #    NOTE(review): the original author suspected the extracted text held
    #    blanks that were not ASCII spaces; confirm against real page output.
    string = string.replace('\u00a0', ' ').replace('\u3000', ' ')

    # 1+2. Collapse every run of newlines and spaces that contains at least
    #      one newline into a single newline. This removes empty lines,
    #      space-only lines (including several in a row), and spaces at the
    #      start or end of a line, with no upper limit on run length.
    string = re.sub(r' *\n[ \n]*', '\n', string)

    # 3. Convert the full-width (Chinese) colon U+FF1A to an ASCII colon.
    #    Written as an escape so the two colons cannot be confused.
    string = string.replace('\uff1a', ':')

    # 4. Join a label to its value when the colon is followed by a line
    #    break: drop the spaces around the colon and the newline after it.
    string = re.sub(r' *: *\n *', ':', string)
    # 5. Remove spaces that directly follow a colon.
    string = re.sub(r': +', ':', string)
    # 6. Squeeze every remaining run of two or more spaces into one space.
    string = re.sub(r' {2,}', ' ', string)
    
    # Destination file for the cleaned-up text.
    fileName = 'E:\\网站文件\\' + 'test.txt'
    print(fileName)
    # The with-block closes the file even if the write raises.
    with open(fileName, 'w', encoding='utf-8') as fh:
        fh.write(string)
    
    
    
    会当凌绝顶,一览众山小
  • 相关阅读:
    Luogu4655 [CEOI2017]Building Bridges
    bzoj4892 [TJOI2017]DNA
    Luogu5058 [ZJOI2004]嗅探器
    bzoj4373 算术天才⑨与等差数列
    bzoj3122 [SDOI2013]随机数生成器
    CF940F Machine Learning
    bzoj1935 [SHOI2007]Tree 园丁的烦恼
    CF1012B Chemical table
    CF1012A Photo of The Sky
    bzoj4850 [JSOI2016]灯塔
  • 原文地址:https://www.cnblogs.com/leyzzz/p/15726272.html
Copyright © 2011-2022 走看看