zoukankan      html  css  js  c++  java
  • crawl wechat page

    #!/usr/bin/python
    # coding: utf-8
    import re
    from collections import Counter
    
    import requests
    import time
    from bs4 import BeautifulSoup
    
    
    def count_zero(text):
        """Return the number of '0' characters in *text* if there is more
        than one, otherwise ``False``.

        Used as a heuristic: a paragraph that packs several zero-padded
        numbered titles ('001...002...') contains multiple zeros.
        """
        zero_count = text.count('0')
        return zero_count if zero_count > 1 else False
    
    
    def get_normal_title(text):
        """Split a line packing several numbered titles into a list of
        normalized titles, each re-prefixed with its zero-padded number.

        The caller guarantees *text* starts with a 3-digit sequence number
        (e.g. '001foo002bar' -> ['001foo', '002bar']); a non-numeric prefix
        raises ValueError, as in the original.

        BUG FIX: the split pattern was 'ddd' (backslashes lost in transit),
        which matched the literal string 'ddd'; the intent is a run of three
        digits, r'\d{3}'.  Also replaced the hand-rolled zero-padding (which
        produced '0010' when numbering crossed from 009 to 010) with
        str.zfill(3), and the magic flags=1 with re.IGNORECASE.
        """
        # First three characters are the zero-padded number of the first title.
        start_index = text[:3]

        # maxsplit=20 preserved from the original: at most 20 titles per line.
        titles = re.split(pattern=r'\d{3}', string=text, maxsplit=20,
                          flags=re.IGNORECASE)

        # Question marks are placeholder garbage from the scrape; blank
        # fragments come from the split at the leading number.
        titles_rm_question = [item.replace('?', ' ') for item in titles]
        titles_rm_blank = [t for t in titles_rm_question if t]

        base = int(start_index)
        titles_normal = []
        for index, body in enumerate(titles_rm_blank):
            # zfill(3) pads 1 -> '001' and 10 -> '010'; numbers >= 100 pass through.
            titles_normal.append(str(base + index).zfill(3) + body)

        return titles_normal
    
    
    def eliminate_question(title):
        """Return *title* as a string with non-breaking spaces (U+00A0) removed.

        BUG FIX: the original replaced the literal substring 'xa0' (the
        backslash was lost when the code was scraped); the intent is to strip
        the NBSP character '\xa0' that BeautifulSoup's get_text() leaves in
        scraped titles.
        """
        return str(title).replace('\xa0', '')
    
    
    def get_title_url(response):
        """Parse the article index HTML and map each normalized title to its URL.

        Scans every <p> whose text starts with a 3-digit number.  A paragraph
        containing several zeros is treated as multiple packed titles and is
        split with get_normal_title(); otherwise the whole text is one title.
        First occurrence of a title wins (setdefault).

        BUG FIX: the match pattern was '^d{3}.*$' (backslash lost in transit),
        which required a literal 'd'; the intent is three leading digits,
        r'^\d{3}'.  The pattern is now compiled once outside the loop, and a
        paragraph with no <a> tag is skipped instead of raising AttributeError.
        """
        title_url_dict = {}
        soup = BeautifulSoup(response, 'html.parser')
        # Compile once; matches paragraphs starting with a 3-digit number.
        pattern = re.compile(r'^\d{3}.*$')
        for each_p in soup.find_all('p'):
            text = each_p.get_text()
            if not pattern.match(text):
                continue
            if count_zero(text):
                # Several titles packed in one paragraph: pair the split
                # titles with the <a> hrefs in document order.
                titles = get_normal_title(text)
                urls = [a.get('href') for a in each_p.find_all('a')]
                for title, url in zip((eliminate_question(t) for t in titles), urls):
                    title_url_dict.setdefault(title, url)
            else:
                anchor = each_p.find('a')
                if anchor is None:
                    # No link in this paragraph; nothing to record.
                    continue
                title_url_dict.setdefault(eliminate_question(text), anchor.get('href'))

        return title_url_dict
    
    
    def download_content(url, title):
        """Fetch *url* and save the response body to '<title>.html'
        (UTF-8 encoded) in the current directory."""
        html = requests.get(url=url).text
        out_path = title + '.html'
        with open(out_path, 'w', encoding='utf-8') as handle:
            handle.write(html)
    
    
    def main():
        """Crawl the WeChat index article and download every linked page."""
        url_wechat_index = 'https://mp.weixin.qq.com/s/7o8QxGydMTUe4Q7Tz46Diw'
        index_html = requests.get(url=url_wechat_index).text
        for title, url in get_title_url(index_html).items():
            # Throttle requests to avoid hammering the server.
            time.sleep(5)
            download_content(url, title)
    
    
    # Run the crawler only when executed as a script, not on import.
    if __name__ == "__main__":
        main()
    
  • 相关阅读:
    css选择器中:first-child与:first-of-type的区别
    Chrome 快捷键
    notepad++ html格式化
    Linux VFS的主要的数据结构
    Linux根文件系统介绍
    Linux文件系统测试工具
    p​o​s​t​m​a​r​k​使​用
    虚拟文件系统
    linux文件系统初始化过程(6)---执行init程序
    linux文件系统初始化过程(4)---加载initrd(中)
  • 原文地址:https://www.cnblogs.com/otfsenter/p/9548477.html
Copyright © 2011-2022 走看看