zoukankan      html  css  js  c++  java
  • crawl wechat page

    #!/usr/bin/python
    # coding: utf-8
    import re
    from collections import Counter
    
    import requests
    import time
    from bs4 import BeautifulSoup
    
    
    def count_zero(text):
        """Return the number of '0' characters in *text* if there is more
        than one, otherwise ``False``.

        Used as a heuristic: a paragraph that packs several zero-padded
        numbered titles ('001...002...') contains multiple zeros.
        """
        zero_count = text.count('0')
        return zero_count if zero_count > 1 else False
    
    
    def get_normal_title(text):
        """Split a line packing several numbered titles into a list of
        normalized titles, each re-prefixed with its zero-padded number.

        The caller guarantees *text* starts with a 3-digit sequence number
        (e.g. '001foo002bar' -> ['001foo', '002bar']); a non-numeric prefix
        raises ValueError, as in the original.

        BUG FIX: the split pattern was 'ddd' (backslashes lost in transit),
        which matched the literal string 'ddd'; the intent is a run of three
        digits, r'\d{3}'.  Also replaced the hand-rolled zero-padding (which
        produced '0010' when numbering crossed from 009 to 010) with
        str.zfill(3), and the magic flags=1 with re.IGNORECASE.
        """
        # First three characters are the zero-padded number of the first title.
        start_index = text[:3]

        # maxsplit=20 preserved from the original: at most 20 titles per line.
        titles = re.split(pattern=r'\d{3}', string=text, maxsplit=20,
                          flags=re.IGNORECASE)

        # Question marks are placeholder garbage from the scrape; blank
        # fragments come from the split at the leading number.
        titles_rm_question = [item.replace('?', ' ') for item in titles]
        titles_rm_blank = [t for t in titles_rm_question if t]

        base = int(start_index)
        titles_normal = []
        for index, body in enumerate(titles_rm_blank):
            # zfill(3) pads 1 -> '001' and 10 -> '010'; numbers >= 100 pass through.
            titles_normal.append(str(base + index).zfill(3) + body)

        return titles_normal
    
    
    def eliminate_question(title):
        """Return *title* as a string with non-breaking spaces (U+00A0) removed.

        BUG FIX: the original replaced the literal substring 'xa0' (the
        backslash was lost when the code was scraped); the intent is to strip
        the NBSP character '\xa0' that BeautifulSoup's get_text() leaves in
        scraped titles.
        """
        return str(title).replace('\xa0', '')
    
    
    def get_title_url(response):
        """Parse the article index HTML and map each normalized title to its URL.

        Scans every <p> whose text starts with a 3-digit number.  A paragraph
        containing several zeros is treated as multiple packed titles and is
        split with get_normal_title(); otherwise the whole text is one title.
        First occurrence of a title wins (setdefault).

        BUG FIX: the match pattern was '^d{3}.*$' (backslash lost in transit),
        which required a literal 'd'; the intent is three leading digits,
        r'^\d{3}'.  The pattern is now compiled once outside the loop, and a
        paragraph with no <a> tag is skipped instead of raising AttributeError.
        """
        title_url_dict = {}
        soup = BeautifulSoup(response, 'html.parser')
        # Compile once; matches paragraphs starting with a 3-digit number.
        pattern = re.compile(r'^\d{3}.*$')
        for each_p in soup.find_all('p'):
            text = each_p.get_text()
            if not pattern.match(text):
                continue
            if count_zero(text):
                # Several titles packed in one paragraph: pair the split
                # titles with the <a> hrefs in document order.
                titles = get_normal_title(text)
                urls = [a.get('href') for a in each_p.find_all('a')]
                for title, url in zip((eliminate_question(t) for t in titles), urls):
                    title_url_dict.setdefault(title, url)
            else:
                anchor = each_p.find('a')
                if anchor is None:
                    # No link in this paragraph; nothing to record.
                    continue
                title_url_dict.setdefault(eliminate_question(text), anchor.get('href'))

        return title_url_dict
    
    
    def download_content(url, title):
        """Fetch *url* and save the response body to '<title>.html'
        (UTF-8 encoded) in the current directory."""
        html = requests.get(url=url).text
        out_path = title + '.html'
        with open(out_path, 'w', encoding='utf-8') as handle:
            handle.write(html)
    
    
    def main():
        """Crawl the WeChat index article and download every linked page."""
        url_wechat_index = 'https://mp.weixin.qq.com/s/7o8QxGydMTUe4Q7Tz46Diw'
        index_html = requests.get(url=url_wechat_index).text
        for title, url in get_title_url(index_html).items():
            # Throttle requests to avoid hammering the server.
            time.sleep(5)
            download_content(url, title)
    
    
    # Run the crawler only when executed as a script, not on import.
    if __name__ == "__main__":
        main()
    
  • 相关阅读:
    css选择器中:first-child与:first-of-type的区别
    Chrome 快捷键
    notepad++ html格式化
    Linux VFS的主要的数据结构
    Linux根文件系统介绍
    Linux文件系统测试工具
    p​o​s​t​m​a​r​k​使​用
    虚拟文件系统
    linux文件系统初始化过程(6)---执行init程序
    linux文件系统初始化过程(4)---加载initrd(中)
  • 原文地址:https://www.cnblogs.com/otfsenter/p/9548477.html
Copyright © 2011-2022 走看看