zoukankan      html  css  js  c++  java
  • 评论抓取:Python爬取微信在APPStore上的评论内容及星级

    #完整程序如下:
    import requests
    import re
    
    def getHTMLText(url):
        try:
            r = requests.get(url)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except:
            return ''
    
    def printAPPName(html):
        try:
            pattern = re.compile(r'{"im:name":{"label":(.*?)}, "rights"', re.S)
            #如果不使用re.S参数,则只在每一行内进行匹配,如果一行没有,就换下一行重新开始,不会跨行。
            #而使用re.S参数以后,正则表达式会将这个字符串作为一个整体,将“
    ”当做一个普通的字符加入到这个字符串中,在整体中进行匹配
            APPName = re.findall(pattern, str(html))
            return 'APPName:' + str(APPName)
        except:
            return ''
    
    def fillUnivlist(titles, comments, stars, html):
        try:
            pattern = re.compile(r'"title":{"label":(.*?)}, "content"', re.S) #提取标题
            nbaInfo = re.findall(pattern, str(html)) #提取title
    
            # findStr = '"title":{"label":'
            # nbaInfo = nbaInfo1[nbaInfo1.find(findStr)+len(findStr):]
            patternFloor = re.compile(r'"content":{"label":(.*?), "attributes":{"type":"text"}}', re.S) #提取content
            floorText = re.findall(patternFloor, str(html))
    
            patternStar = re.compile(r'"im:rating":{"label":(.*?)}, "id"', re.S)  # 提取星级
            star = re.findall(patternStar, str(html))
            # print(str(star))
    
            number = len(nbaInfo)
            print(number)
            for i in range(number):
                Info = nbaInfo[i] #利用Tools类移除不想要的格式字符
                if i==0:Info = Info[Info.find('"title":{"label":')+len('"title":{"label":'):]
                # print(Info)
                Info1 = floorText[i]
                Info2 = star[i]
                # print(Info2+"hello")
                titles.append('title:' + Info)
                comments.append('content:' + Info1)
                stars.append('star:' + Info2)
        except:
            return ''
    
    def writeText(titleText, fpath):
        try:
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(titleText)+'
    ')
                f.write('
    ')
                f.close()
        except:
            return ''
    
    def writeUnivlist(titles, comments, stars, fpath, num):
        with open(fpath, 'a', encoding='utf-8') as f:
            for i in range(num):
                f.write(str(stars[i]) + '
    ')
                f.write('*' * 10 + '
    ')
                f.write(str(titles[i]) + '
    ')
                f.write('*' * 50 + '
    ') #输入一行*号
                f.write(str(comments[i]) + '
    ')
                f.write('*' * 100 + '
    ')
            f.close()
    
    def main():
        count = 0
        url = 'https://itunes.apple.com/rss/customerreviews/page=1/id=414478124/sortby=mostrecent/json?l=en&&cc=cn' #要访问的网址
        output_file = 'D:/StockInfo.txt' #最终文本输出的文件
        html = getHTMLText(url) #获取HTML
        APPName = printAPPName(html)
        writeText(APPName, output_file)
        for i in range(10):
            i = i + 1
            titles = []
            comments = []
            stars = []
            url = 'https://itunes.apple.com/rss/customerreviews/page=' + str(i) + '/id=414478124/sortby=mostrecent/json?l=en&&cc=cn'
            html = getHTMLText(url)
            fillUnivlist(titles, comments, stars, html)
            writeUnivlist(titles, comments, stars, output_file, len(titles))
            count = count + 1
            print("
    当前进度: {:.2f}%".format(count * 100 / 10), end="")
    
    if __name__ == '__main__':
        main()
    
    
    #如果想爬取其他APP只需要改变id的值,如想爬腾讯的,只需将id=414478124换成id=444934666
    #另外本程序是模仿https://www.cnblogs.com/sea-ocean/p/6601421.html的
  • 相关阅读:
    DICOMDIR结构
    给文件夹添加Everyone用户
    关于Predicate<T>委托
    开发者必备的6款源码搜索引擎
    Create XML Files Out Of SQL Server With SSIS And FOR XML Syntax
    create xml file from sql script
    DICOM中的入门概念
    小米note开启调试模式
    [转] Java基础知识——Java语言基础
    Java语言基本语法
  • 原文地址:https://www.cnblogs.com/wu-chao/p/9210416.html
Copyright © 2011-2022 走看看