zoukankan      html  css  js  c++  java
  • Python-微信小程序信息的爬取

    import requests
    import csv
    from lxml import html
    from bs4 import BeautifulSoup
    # Browser-like request headers sent with HTTP requests so the crawler
    # identifies itself as desktop Chrome.
    Header = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/79.0.3928.4 Safari/537.36'
        ),
    }
    def getdata(url):
        """Fetch one article page and append its title, summary and time to a CSV.

        Downloads ``url``, extracts the ``h1.ph`` title, the ``div.blockquote``
        summary and the ``span.time`` timestamp, prints each of them, and
        appends them as a single row to the relative path ``微信小程序.csv``.
        A page that lacks any of the three elements is reported and skipped
        instead of aborting the crawl.

        Args:
            url: Absolute URL of the article page to scrape.
        """
        # Send the same browser-like headers that parse_page uses for listings.
        resp = requests.get(url, headers=Header)
        # Decode with the charset detected from the body (as parse_page does)
        # rather than relying on the HTTP header default.
        resp.encoding = resp.apparent_encoding
        soup = BeautifulSoup(resp.text, "lxml")

        title = soup.find('h1', class_='ph')
        summary = soup.find('div', class_='blockquote')
        published = soup.find('span', class_='time')
        # find() returns None when the element is absent; dereferencing .text
        # on it would raise AttributeError and stop the whole run.
        if title is None or summary is None or published is None:
            print('Skipping {}: missing title, summary or time element'.format(url))
            return

        row = [title.text, summary.text, published.text]
        for value in row:
            print(value)

        # Append to the CSV file. An explicit encoding avoids platform-default
        # codecs that cannot represent Chinese text; the BOM variant lets
        # spreadsheet tools detect UTF-8. The with-block closes the file.
        with open("微信小程序.csv", "a", newline="", encoding="utf-8-sig") as cf:
            csv.writer(cf).writerow(row)
    
    def parse_page(url):
        """Crawl one listing page and scrape every article it links to.

        Downloads ``url``, selects the article anchors under the element with
        id ``itemContainer``, resolves each ``href`` against the site root and
        passes the resulting URL to ``getdata``.

        Args:
            url: URL of the listing page to crawl.
        """
        # Function-scope import so the block does not depend on a new
        # top-of-file import.
        from urllib.parse import urljoin

        resp = requests.get(url, headers=Header)
        resp.encoding = resp.apparent_encoding
        tree = html.fromstring(resp.text)
        # Collect the hyperlinks to articles found on this listing page.
        anchors = tree.xpath('//*[@id="itemContainer"]/div/div/h3/a')
        for anchor in anchors:
            href = anchor.get('href')
            if not href:
                # An anchor without href cannot be followed.
                continue
            # urljoin handles relative, root-relative and absolute hrefs;
            # plain string concatenation only worked for relative ones.
            getdata(urljoin("http://www.wxapp-union.com/", href))
        # NOTE(review): this message says the whole crawl has finished, but it
        # is printed after every listing page - confirm the intended wording.
        print('微信小程序全部爬取完成')
    def began(first_page=1, last_page=107):
        """Crawl a range of listing pages of the mini-program category.

        Args:
            first_page: Number of the first listing page to crawl (inclusive).
            last_page: Number of the last listing page to crawl (inclusive).
                Defaults to 107, the page count recorded by the original
                author.
        """
        url = "http://www.wxapp-union.com/portal.php?mod=list&catid=1&page={}"
        for page in range(first_page, last_page + 1):
            parse_page(url.format(page))
    # Start the crawl only when this file is executed as a script, not when
    # it is imported as a module.
    if __name__ == '__main__':
        began()
  • 相关阅读:
    ACM-ICPC 2018 南京赛区网络预赛 J.Sum
    汉诺塔
    汉诺塔
    D
    D
    数学小定理
    数学小定理
    Python index()方法
    Python endswith()方法
    Python encode()方法
  • 原文地址:https://www.cnblogs.com/Angfe/p/13035971.html
Copyright © 2011-2022 走看看