zoukankan      html  css  js  c++  java
  • 豆瓣爬虫

    豆瓣爬虫

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    from sklearn.linear_model import LinearRegression
    import seaborn as sns
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib
    from scipy.optimize import leastsq
    
    def get_html(url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                ``requests`` applies no timeout by default, so without one a
                stalled connection would block the script indefinitely.

        Returns:
            The decoded response body (``resp.text``). The HTTP status code is
            not inspected, so an error page is returned unchanged.

        Raises:
            requests.exceptions.RequestException: On connection failure or
                when ``timeout`` expires.
        """
        # Send a browser User-Agent so the request does not look like a script.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'}
        resp = requests.get(url, headers=headers, timeout=timeout)
        return resp.text
    
    # Fetch the first page of the Douban Top 250 list and parse it.
    url = 'https://movie.douban.com/top250'
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')

    # Movie titles: text of the first <span> inside the link of each
    # <div class="hd"> element.
    a = soup.find_all('div', class_='hd')
    film_name = [div.a.span.text for div in a]

    # Ratings: every <span class="rating_num"> element.
    rating_score = soup.find_all('span', class_='rating_num')

    # Keep at most `num` rows. Slicing plus zip() stops at the shorter list, so
    # a page with fewer parsed entries produces a shorter table instead of an
    # IndexError.
    num = 20
    lt = [
        [rank, name, score.string]
        for rank, (name, score) in enumerate(
            zip(film_name[:num], rating_score[:num]), start=1)
    ]

    # Build the DataFrame once, after all rows are collected (it was
    # previously rebuilt on every loop iteration and undefined for zero rows).
    df = pd.DataFrame(lt, columns=['排名', '电影名', '评分'])

    # Save the table to disk so the scraped data persists.
    # NOTE(review): the path contains no separators and looks like it lost its
    # backslashes somewhere; confirm the intended location before running.
    df.to_csv(r'C:UsersadmirDesktop参考豆瓣电影数据.csv')

    根据网页的分页格式调整 URL,实现批量抓取并输出(共 10 页,每页 25 条):

    import json  
    import requests  
    from requests.exceptions import RequestException  
    import re  
    import time 
    
    def get_one_page(url, timeout=10):
        """Fetch one page and return its HTML, or ``None`` on any failure.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                ``requests`` applies no timeout by default; an expired timeout
                raises a ``RequestException`` subclass and therefore yields
                ``None`` like any other request error.

        Returns:
            The response body as text when the status code is 200, otherwise
            ``None`` (non-200 status or a ``RequestException``).
        """
        # Request headers: present a browser User-Agent to the server.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'
        }
        # Keep the try body down to the single call that can raise.
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except RequestException:
            return None
        if response.status_code != 200:
            return None
        return response.text
    def parse_one_page(html):
        """Yield one dict per movie entry found in ``html``.

        Args:
            html: Page source as a string. ``None`` or an empty string (for
                example after a failed download) yields nothing instead of
                raising ``TypeError`` inside ``re``.

        Yields:
            Dicts with the string-valued keys ``'index'``, ``'title'``,
            ``'score'`` and ``'comment'``, in page order.
        """
        if not html:
            return

        # re.S lets '.' match newlines, so a single match can span the several
        # lines of markup that make up one <li> entry.
        # NOTE(review): every entry is required to contain a
        # <span class="inq"> quote. For an entry without one, the lazy '.*?'
        # can run on into the following entry -- verify against the live page.
        pattern = re.compile('<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>.*? <span class="rating_num" property="v:average">(.*?)</span>.*?<span class="inq">(.*?)</span>', re.S)
        for index, title, score, comment in pattern.findall(html):
            yield {
                'index': index,
                'title': title,
                'score': score,
                'comment': comment,
            }
    def write_to_file(content):
        """Append ``content`` to the output file as one JSON document per line.

        Args:
            content: A JSON-serialisable object (here, one movie dict).
        """
        # The original author notes the extension can be changed to .csv; the
        # content written is JSON text either way.
        # NOTE(review): the path contains no separators and looks like it lost
        # its backslashes somewhere; confirm the intended location.
        with open(r'C:UsersadmirDesktop参考douban250.txt', 'a', encoding='utf-8') as f:
            # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX
            # escapes. Records are newline-terminated so the file can be read
            # back line by line.
            # NOTE(review): the separator was a single space in the collapsed
            # source; '\n' is assumed to be the intended value.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')


    def main(offset):
        """Scrape one page of the Top 250 list, then print and store its entries.

        Args:
            offset: Value of the ``start`` query parameter for the page.
        """
        url = 'https://movie.douban.com/top250?start='+str(offset)+'&filter='
        html = get_one_page(url)
        if html is None:
            # get_one_page returns None on a failed request; skip this page
            # rather than handing None to the parser.
            print('Failed to fetch ' + url)
            return
        for item in parse_one_page(html):
            print(item)
            write_to_file(item)


    if __name__ == '__main__':
        # 10 pages with offsets 0, 25, ..., 225.
        for i in range(10):
            main(offset=i * 25)
            # Pause between pages.
            # NOTE(review): the collapsed source does not show whether the
            # sleep sat inside the loop; this placement is assumed.
            time.sleep(1)
  • 相关阅读:
    银行卡和手机号占位符
    防京东进度尺的金额
    圆的进度条
    HMTL5滑动块研究
    自动生成验证码
    HTML5语义化
    (转)C++中使用C代码
    (转)四旋翼飞行器基本知识
    如何将.jpg图片 转换成.eps 格式图片
    HDOJ 1196 Lowest Bit
  • 原文地址:https://www.cnblogs.com/celine227/p/14473221.html
Copyright © 2011-2022 走看看