zoukankan      html  css  js  c++  java
  • 猫眼100 爬虫

    正则表达式爬虫

    完整代码

    import requests  # 获取网页数据
    import re  # 正则表达式
    from bs4 import BeautifulSoup  # 网页解析,获取数据
    import xlwt  # 保存为excel
    
    # Pre-compiled patterns, applied to the string form of each <dd> element.
    # Digits that follow the tag carrying "board-index".
    findIndex = re.compile(r'board-index.*?>(\d+).*?')
    # Value of the src="..." attribute that follows class="board-img".
    findImage = re.compile(r'class="board-img".*?src="(.*?)"')
    # Text captured between title=" and the next ">.
    findTitle = re.compile(r'title="(.*?)">')
    # Group 1 is the single character (or newline) right after class="star">;
    # group 2 is everything that follows up to the next line break.
    findActor = re.compile(r'class="star">(.|\n)(.*)')
    # Text of the "releasetime" paragraph; the space after </p> is part of the
    # pattern.
    findTime = re.compile(r'class="releasetime">(.*?)</p> ')
    # The score is split across an "integer" and a "fraction" element.
    findScore1 = re.compile(r'class="integer">(.*?)</i>')
    findScore2 = re.compile(r'class="fraction">(.*?)</i>')
    
    
    # 1. Fetch the web pages
    # 2. Parse the data
    # 3. Save the data
    
    def main():
        """Scrape the board pages and store the rows in an .xls workbook."""
        rows = getData("https://maoyan.com/board/4?offset=")
        saveData(rows, "猫眼TOP100.xls")
    
    
    def _firstMatch(pattern, text, default=""):
        """Return the first ``findall`` result of ``pattern`` in ``text``, or ``default``."""
        found = pattern.findall(text)
        return found[0] if found else default


    def getData(baseUrl):
        """Fetch the ten board pages and return one row per ``<dd>`` element.

        Requests ``baseUrl`` followed by the offsets 0, 10, ..., 90, parses each
        response with BeautifulSoup and applies the module-level patterns to
        the string form of every ``<dd>`` element found.

        Args:
            baseUrl: URL prefix to which the page offset is appended.

        Returns:
            A list of rows. Each row is a list of six strings in the order
            index, image URL, title, cast, release time, score. A field whose
            pattern does not match is stored as an empty string, so one
            malformed entry no longer aborts the whole run.
        """
        datalist = []
        for page in range(0, 10):
            html = askUrl(baseUrl + str(page * 10))
            soup = BeautifulSoup(html, "html.parser")

            for item in soup.find_all("dd"):
                entry = str(item)

                # findActor yields a (first character, rest of line) tuple;
                # the last group is the one kept, stripped of padding.
                actorGroups = _firstMatch(findActor, entry, ("", ""))
                actor = actorGroups[-1].strip()

                # The score is the integer part followed by the fraction part.
                score = _firstMatch(findScore1, entry) + _firstMatch(findScore2, entry)

                datalist.append([
                    _firstMatch(findIndex, entry),
                    _firstMatch(findImage, entry),
                    _firstMatch(findTitle, entry),
                    actor,
                    _firstMatch(findTime, entry),
                    score,
                ])
        return datalist
    
    
    # 爬取网页
    def askUrl(url, timeout=10):
        """Download ``url`` and return the response body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The decoded page text, or an empty string when the request fails.
        """
        # Send a desktop-browser User-Agent with the request.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4209.2 Safari/537.36"}

        html = ""
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=timeout)
            html = response.content.decode("utf-8")
        except requests.exceptions.RequestException as e:
            # ``requests.exceptions`` itself is a module and cannot be caught;
            # RequestException is the base class of the library's errors.
            print("请求失败: %s (%s)" % (url, e))

        return html
    
    
    def saveData(datalist, savepath):
        """Write the scraped rows to an .xls workbook at ``savepath``.

        Row 0 of the sheet holds the six column headings. At most the first
        100 rows of ``datalist`` follow, one sheet row each, and a progress
        line is printed for every row actually written.

        Args:
            datalist: List of rows, each with at least six values.
            savepath: Path of the workbook file to create.
        """
        book = xlwt.Workbook(encoding="utf-8")
        sheet = book.add_sheet("猫眼TOP100", cell_overwrite_ok=True)
        col = ("电影排名", "图片地址", "电影名称", "演出人员", "上映时间", "电影评分")
        for j, heading in enumerate(col):
            sheet.write(0, j, heading)
        # Slicing bounds the loop to the rows that exist, so no exception
        # handler is needed to skip missing records.
        for i, data in enumerate(datalist[:100], start=1):
            print("第%d条" % i)
            for j in range(len(col)):
                sheet.write(i, j, data[j])
        book.save(savepath)
    
    
    # Run the scraper only when this file is executed as a script.
    if __name__ == "__main__":
        main()
        print("爬取完成")
    
    
  • 相关阅读:
    linux socat创建简单的tun隧道
    【k8s】sc-nfs-pod
    c#中equals和==
    数据结构之哈希表
    数据结构之红黑树
    数据结构之2-3查找树
    数据结构之二叉查找树
    数据结构之递归与栈
    数据结构之二分查找法(折半查找)
    数据结构之基于无序链表的集合和映射
  • 原文地址:https://www.cnblogs.com/Lin1031/p/13702963.html
Copyright © 2011-2022 走看看