  • Python in practice: scraping the Maoyan Top 100 movies

    Scraping the Maoyan Top 100 with the requests library and regular expressions

    import requests
    import re
    import json
    import time
    from requests.exceptions import RequestException
    
    
    def get_one_page(url):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
            }
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
    
    
    def parse_one_page(html):
        # one pass over the page source; re.S lets '.' match newlines
        pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                             r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                             r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
        items = re.findall(pattern, html)
        for item in items:
            yield {
                'index': item[0],
                'image': item[1],
                'title': item[2].strip(),
                # drop the "主演：" (starring) prefix, 3 characters
                'actor': item[3].strip()[3:] if len(item[3]) > 3 else '',
                # drop the "上映时间：" (release time) prefix, 5 characters
                'time': item[4].strip()[5:] if len(item[4]) > 5 else '',
                # the integer part ("9.") and fraction part ("5") are separate tags
                'score': item[5].strip() + item[6].strip()
            }
    
    def write_to_file(content):
        with open('result.txt','a',encoding='utf-8') as f:
            f.write(json.dumps(content, ensure_ascii=False) + '\n')


    def main(offset):
        url = 'https://maoyan.com/board/4?offset=' + str(offset)
        html = get_one_page(url)
        # print(html)
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)


    if __name__ == '__main__':
        for i in range(10):
            main(offset=i*10)
            time.sleep(1)
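
    A quick sanity check for the pattern is to run it against a hand-written snippet of the board markup. The snippet below is a simplified approximation of Maoyan's <dd> block (not the live page), so treat it as illustration only:

    import re

    sample = '''<dd>
        <i class="board-index board-index-1">1</i>
        <img data-src="https://p0.meituan.net/movie/sample.jpg" alt=""/>
        <p class="name"><a href="/films/1203">霸王别姬</a></p>
        <p class="star">主演：张国荣,张丰毅,巩俐</p>
        <p class="releasetime">上映时间：1993-01-01</p>
        <p class="score"><i class="integer">9.</i><i class="fraction">5</i></p>
    </dd>'''

    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    print(re.findall(pattern, sample))
    # [('1', 'https://p0.meituan.net/movie/sample.jpg', '霸王别姬',
    #   '主演：张国荣,张丰毅,巩俐', '上映时间：1993-01-01', '9.', '5')]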
    

    Scraping the Maoyan Top 100 with the requests and BeautifulSoup libraries

    import requests
    from bs4 import BeautifulSoup
    
    
    def gethtmlpage(url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    
    
    def parsehtmlpage(html):
        soup = BeautifulSoup(html, 'lxml')
        a = soup.select('.movie-item-info a')
        return a
    
    
    def write_to_file(content):
        with open('result.txt', 'a', encoding="utf-8") as f:
            f.write(content + '\n')
    
    
    def main(url):
        html = gethtmlpage(url)
        titles = parsehtmlpage(html)
        for tag in titles:
            if tag.string:  # skip anchors whose content is nested markup
                write_to_file(tag.string)
    
    
    if __name__ == '__main__':
        # offsets 0, 10, ..., 90 cover all 100 entries
        for i in range(0, 100, 10):
            url = "https://maoyan.com/board/4?offset=%d" % i
            main(url)
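
    The .movie-item-info a selector grabs every anchor inside the info block, so only the title text is written out. A minimal sketch of pulling a second field from the same markup, assuming the board pages keep the .name and .releasetime class names seen in the regex version (verify against the live HTML):

    from bs4 import BeautifulSoup

    def parse_movie_items(html):
        soup = BeautifulSoup(html, 'lxml')
        for info in soup.select('.movie-item-info'):
            title_tag = info.select_one('.name a')
            time_tag = info.select_one('.releasetime')
            yield {
                'title': title_tag.get_text(strip=True) if title_tag else '',
                'time': time_tag.get_text(strip=True) if time_tag else '',
            }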
    

    Scraping the Douban Top 250 with the requests and BeautifulSoup libraries:

    import requests
    from bs4 import BeautifulSoup
    import bs4
    
    
    def gethtmlpage(url):
        try:
            # note: Douban may reject requests that lack a browser
            # User-Agent header, so consider passing headers= as above
            r = requests.get(url)
            r.raise_for_status()  # raise for 4xx/5xx status codes
            r.encoding = r.apparent_encoding
            return r.text
        except requests.exceptions.RequestException:
            return "network error"
        except Exception:
            return "unknown error"
    
    
    def parsehtmlpage(html):
        soup = BeautifulSoup(html, 'lxml')
        ol = soup.select("ol.grid_view")
        li = ol[0].select('li')
        movie = []
        for item in li:
            index = item.select(".pic em")[0].string
            # attrs takes a dict mapping attribute name to value
            title = item.find("span", attrs={'class': 'title'}).string
            rating_num = item.find("span", attrs={'class': 'rating_num'}).string
            lianjie = item.select(".hd a")[0].get('href')
            # not every entry carries a one-line synopsis
            inq_tag = item.find("span", attrs={'class': 'inq'})
            inq = inq_tag.string if isinstance(inq_tag, bs4.element.Tag) else "no synopsis"
            movie.append([index, title, rating_num, lianjie, inq])
        return movie
    
    
    def writetofile(content):
        with open('result.csv', 'a', encoding='utf-8') as f:
            f.write(content)
    
    
    def main(url):
        html = gethtmlpage(url)
        movie = parsehtmlpage(html)
        for m in movie:
            # chr(12288) is the full-width (ideographic) space, used as the
            # fill character so CJK titles line up in fixed-width columns
            writetofile("{0:^5}\t{1:{5}^10}\t{2:^10}\t{3:^40}\t{4:<10}\n".format(
                m[0], m[1], m[2], m[3], m[4], chr(12288)))
    
    
    if __name__ == '__main__':
        writetofile("{0:^5}\t{1:{5}^10}\t{2:^10}\t{3:^40}\t{4:<10}\n".format(
            "Rank", "Title", "Rating", "Link", "Synopsis", chr(12288)))
        for i in range(0, 10):
            url = "https://movie.douban.com/top250?start={}".format(i*25)
            main(url)
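
    Writing tab-separated rows into a .csv file works, but quoting and embedded commas are easier to get right with the standard csv module. A minimal sketch under the same result.csv name (write_rows is a hypothetical helper, not part of the original script):

    import csv

    def write_rows(rows, path='result.csv'):
        # newline='' is the documented way to open a file for csv.writer
        with open(path, 'a', encoding='utf-8', newline='') as f:
            csv.writer(f).writerows(rows)

    # write_rows([["Rank", "Title", "Rating", "Link", "Synopsis"]])
    # write_rows(parsehtmlpage(html))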
    
  • Original post: https://www.cnblogs.com/v01cano/p/10792667.html