zoukankan      html  css  js  c++  java
  • python3爬虫再探之豆瓣影评数据抓取

        一个关于豆瓣影评的爬虫,涉及:模拟登陆,翻页抓取。直接上代码:

    import re
    import time
    import requests
    import xlsxwriter
    from bs4 import BeautifulSoup
    
    
    # Request headers: spoof a desktop Chrome User-Agent and send the douban
    # login page as Referer so the site serves pages as to a normal browser.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
               'Referer':'https://www.douban.com/accounts/login?source=movie'}
    # One shared session so login cookies persist across every later request.
    s = requests.Session()
    def log_in(login_url):
        """Log in to douban through *login_url* using the module-level session ``s``.

        Fetches the login page; if a captcha is present, saves its image to
        ``douban.jpg`` and prompts the user to type it, then POSTs the
        credential form. The douban login page does not always show a
        captcha — the original code crashed with IndexError in that case,
        so the captcha fields are now added only when actually found.

        Parameters
        ----------
        login_url : str
            The douban account login endpoint to POST credentials to.
        """
        # Fetch the login page (SSL verification bypassed, as in the original).
        imgdata = s.get("https://www.douban.com/accounts/login?source=movie", headers=headers, verify=False).text

        payload = {
        "source":"movie",
        "redir":"https://movie.douban.com/",
        "form_email":"你的邮箱",
        "form_password":"你的密码",
        "login":"登录"
        }

        # Captcha image URL and its hidden random id, when the page shows one.
        pa = re.compile(r'<img id="captcha_image" src="(.*?)" alt="captcha" class="captcha_image"/>')
        pa_id = re.compile(r'<input type="hidden" name="captcha-id" value="(.*?)"/>')
        img_urls = re.findall(pa, imgdata)
        capids = re.findall(pa_id, imgdata)

        if img_urls and capids:
            # Save the captcha image locally so the user can read and type it.
            picdata = s.get(img_urls[0]).content
            with open("douban.jpg", 'wb') as f:
                f.write(picdata)
            capimg = input("输入验证码:")
            payload["captcha-solution"] = capimg
            payload["captcha-id"] = capids[0]

        data1 = s.post(login_url, data=payload, verify=False)  # SSL verification bypassed
        print(data1.status_code)
    
    
    i = 0  # global row counter: next worksheet row to write (shared across recursive calls)
    def get_data(url):
        """Scrape one page of comments at *url* into the worksheet, then
        recurse into the next page until no "后一页" link remains.

        Writes one row per comment — column A: date, B: star rating,
        C: vote count, D: comment text — using the module-level session
        ``s``, counter ``i`` and xlsxwriter ``workbook``/``worksheet``.
        Returns None; output is the side effect on the worksheet.
        """
        time.sleep(2)  # throttle so douban does not block the crawl
        print("#"*50)
        global i
        print(i)
        try:
            data = s.get(url, headers = headers).text
            print(data)
        except requests.RequestException:
            try:
                time.sleep(3)
                print("正在尝试重新加载页面...")
                data = s.get(url, headers= headers).text
            except requests.RequestException:
                # Give up on this page: flush the workbook and stop. The
                # original fell through here with ``data`` unbound, which
                # raised NameError at the parsing step below.
                workbook.close()
                return

        # Parse the page.
        soup = BeautifulSoup(data, "lxml")
        comments = soup.findAll("div", {"class":"comment-item"})

        for comment in comments:
            i += 1
            info = comment.find("span",{"class":"comment-info"})

            # get date — the original pattern "dddd-dd-dd" matched literal
            # 'd' characters, so findall()[0] raised IndexError on every
            # comment; match digit groups like 2016-08-27 instead.
            date = info.find("span",{"class":""}).get_text()
            pa_date = re.compile(r"\d{4}-\d{2}-\d{2}")
            date = re.findall(pa_date, date)[0]
            worksheet.write(i,0,date)

            # get star from the rating span's class, e.g. "allstar40" -> "4".
            # NOTE(review): comments without a rating likely make the first
            # <span> something else here — possibly the "1/5 of data missing"
            # the author reports; verify against the live page markup.
            star = info.find("span")["class"][0][-2:-1]
            worksheet.write(i,1,star)

            # get vote count
            vote = comment.find("span", {"class":"comment-vote"}).find("span").get_text()
            worksheet.write(i,2,vote)

            # get comment text
            content = comment.find("div", {"class":"comment"}).find("p").get_text()
            print(content)
            worksheet.write(i,3,content)

        # Find the next-page link and recurse; IndexError means we are on
        # the last page, so flush and close the workbook.
        pa = re.compile('<a href="?(.*?)" .*? class="next">后一页</a>')
        try:
            next_query = str(pa.findall(data)[0]).replace("amp;","")
            next_url = "https://movie.douban.com/subject/25958717/comments" + next_query
            print("正在抓取"+next_url+"...")
            get_data(next_url)
        except IndexError:
            workbook.close()
    
    
    # Script entry: create the output workbook, log in, then crawl.
    workbook = xlsxwriter.Workbook('海蒂和爷爷影评.xlsx')
    worksheet = workbook.add_worksheet()
    # Column widths — A: date, B: star, C: votes, D: comment text.
    worksheet.set_column('A:A', 20)
    worksheet.set_column('B:B', 10)
    worksheet.set_column('C:C', 10)
    worksheet.set_column('D:D', 500)
    
    login_url = "https://accounts.douban.com/login"
    log_in(login_url)
    # get_data fills the worksheet as a side effect; it has no return
    # statement, so comment_data is None.
    comment_data = get_data("https://movie.douban.com/subject/25958717/comments")
    workbook.close()
    

        这里有两个问题:

  1.首先,登录的时候,可能会不需要验证码(当然也不会抓到验证码的图片),加上try就可以了。

  2.数据抓取不全。总是剩下1/5左右的数据抓不到,目前还未解决,请看到的大神指点!

  • 相关阅读:
    Linux异常现场--pt_regs浅析
    内核调试--确认结构体的size和结构体成员的偏移
    Linux内核中root_domain的建立与初始化
    solr学习笔记-全量更新与增量更新,去除html标签
    oracle 替换clob里面的某个特定的字符串
    oracle重置序列从1开始
    oracle提取汉字拼音首字母
    layer.open的yes函数中获取弹出层(子集iframe)中的元素或参数
    java.util.Date日期时间工具类
    js操作将数字转换成 , 逗号分割的字符串并追加‘万’字
  • 原文地址:https://www.cnblogs.com/buzhizhitong/p/5813286.html
Copyright © 2011-2022 走看看