zoukankan      html  css  js  c++  java
  • Python实现爬取需要登录的网站完整示例

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    # Drive a real Firefox instance and read the album titles from the loaded page.
    driver = webdriver.Firefox()
    try:
        driver.get('https://music.douban.com/')
        # find_elements_by_css_selector() was removed in Selenium 4; the By-based
        # locator is available in both Selenium 3 and 4.
        for album in driver.find_elements(By.CSS_SELECTOR, '.new-albums .album-title'):
            print(album.text)
    finally:
        # Close the browser even when loading or locating fails.
        driver.quit()
    读取页面整合后的结果
     import requests
     from lxml import html

     # Create a session object; it carries the cookies of every request made while logging in.
     session_requests = requests.session()

     # Fetch the login page and extract the CSRF token that the login form requires.
     login_url = "https://bitbucket.org/account/signin/?next=/"
     result = session_requests.get(login_url)
     tree = html.fromstring(result.text)
     csrf_tokens = tree.xpath("//input[@name='csrfmiddlewaretoken']/@value")
     if not csrf_tokens:
         # Fail with a clear message instead of an IndexError on an empty list.
         raise RuntimeError("csrfmiddlewaretoken input not found on the login page")
     authenticity_token = csrf_tokens[0]

     payload = {
         "username": "<你的用户名>",
         "password": "<你的密码>",
         # The page source contains a hidden input named "csrfmiddlewaretoken".
         "csrfmiddlewaretoken": authenticity_token,
     }

     # Perform the login.
     result = session_requests.post(
         login_url,
         data=payload,
         headers=dict(referer=login_url),
     )

     # With the session logged in, scrape content from the bitbucket dashboard page.
     url = 'https://bitbucket.org/dashboard/overview'
     result = session_requests.get(
         url,
         headers=dict(referer=url),
     )

     # Inspect the scraped content.
     tree = html.fromstring(result.content)
     # NOTE(review): the trailing "/" appears to select the children of each
     # matching span — confirm against lxml's ElementPath handling.
     bucket_elems = tree.findall(".//span[@class='repo-name']/")
     # text_content is a method and must be called; "\n" (not the letter "n")
     # is what has to be stripped from the names.
     bucket_names = [bucket.text_content().replace("\n", "").strip() for bucket in bucket_elems]
     print(bucket_names)
    View Code
     1 from bs4 import BeautifulSoup
     2 import requests
     3 
     class CSDN(object):
         """Client that logs in to CSDN and reads the post list over one shared session."""

         def __init__(self, headers):
             """Store the request headers and open the session reused by every request."""
             self.session = requests.Session()
             self.headers = headers

         def get_webflow(self):
             """Fetch the login page and return its ``lt`` and ``execution`` input values as a tuple."""
             login_page = self.session.get(url='http://passport.csdn.net/account/login', headers=self.headers)
             document = BeautifulSoup(login_page.text, 'html.parser')
             lt, execution = (
                 document.find('input', {'name': field})['value']
                 for field in ('lt', 'execution')
             )
             document.clear()
             return (lt, execution)

         def login(self, account, password):
             """Post the login form and print a marker depending on whether the status code is 200."""
             self.username, self.password = account, password
             lt, execution = self.get_webflow()
             form = dict(
                 username=account,
                 password=password,
                 lt=lt,
                 execution=execution,
                 _eventId='submit',
             )
             reply = self.session.post(url='http://passport.csdn.net/account/login', headers=self.headers, data=form)
             print('正常' if reply.status_code == 200 else '异常')

         def func(self):
             """Request the post-list page without following redirects and print the response text."""
             post_list_headers = {
                 'Host': 'write.blog.csdn.net',
                 'Upgrade-Insecure-Requests': '1',
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
             }
             page = self.session.get(
                 url='http://write.blog.csdn.net/postlist',
                 headers=post_list_headers,
                 allow_redirects=False,
             )
             print(page.text)
    if __name__ == '__main__':
        # Browser-like headers sent with the passport.csdn.net requests.
        headers = {
            'Host': 'passport.csdn.net',
            'Origin': 'http://passport.csdn.net',
            'Referer': 'http://passport.csdn.net/account/login',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
        }
        csdn = CSDN(headers=headers)
        # Credentials are left blank in the listing.
        account = password = ''
        csdn.login(account=account, password=password)
        csdn.func()
    View Code
     1 #coding=utf-8  
     2 import requests  
     3 import re  
     4 import time  
     5 import json  
     6 from bs4 import BeautifulSoup as BS  
     7 import sys 
     8   
     # Browser-like User-Agent passed as the headers of the requests.get calls in this script.
     headers = {
         'User-Agent': (
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
             '(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
         ),
     }
    12   
    def Get_Movie_URL():
        """Collect the movie detail-page URLs from the Mtime Top 100 list.

        Requests list pages 1 through 10, parses each one and gathers the
        ``href`` of every ``<a target="_blank">`` that has a ``class`` attribute
        and whose link matches ``http://movie.mtime.com/<digits>/``.

        Returns:
            list: the matching ``href`` values in page order (duplicates kept).
        """
        # The pattern previously read "(d+)", which matches literal "d"
        # characters instead of digits; the dots are escaped as well.
        movie_link = re.compile(r'http://movie\.mtime\.com/(\d+)/')
        urls = []
        for page in range(1, 11):
            # The first page has no "index-N" suffix and needs its own URL.
            if page == 1:
                url = "http://www.mtime.com/top/movie/top100/"
            else:
                url = "http://www.mtime.com/top/movie/top100/index-%d.html" % page
            r = requests.get(url=url, headers=headers)
            soup = BS(r.text, 'lxml')
            # 'class': True is what the original `not None` evaluated to:
            # the tag must carry a class attribute, whatever its value.
            movies = soup.find_all(
                name='a',
                attrs={'target': '_blank', 'href': movie_link, 'class': True},
            )
            urls.extend(m.get('href') for m in movies)
        return urls
    27   
    def Create_Ajax_URL(url):
        """Build the Movie.api rating-request URL for a movie detail page.

        Args:
            url: movie page URL such as ``http://movie.mtime.com/98604/``; the
                trailing slash is optional.

        Returns:
            str: the request URL carrying the page URL, a local-time timestamp
            and the movie id (the last path segment of ``url``).
        """
        # Strip trailing slashes first so the id is the last segment whether or
        # not the URL ends with "/"; split('/')[-2] returned the host name for
        # URLs without a trailing slash.
        movie_id = url.rstrip('/').split('/')[-1]
        t = time.strftime("%Y%m%d%H%M%S0368", time.localtime())
        ajax_url = "http://service.library.mtime.com/Movie.api?Ajax_CallBack=true&Ajax_CallBackType=Mtime.Library.Services&Ajax_CallBackMethod=GetMovieOverviewRating&Ajax_CrossDomain=1&Ajax_RequestUrl=%s&t=%s&Ajax_CallBackArgument0=%s" % (url, t, movie_id)
        return ajax_url
    33   
    def _extract_ajax_json(text):
        """Return the JSON object that follows the first "=" in *text*, or None if there is none."""
        eq = text.find('=')
        start = text.find('{', eq + 1) if eq != -1 else -1
        if start == -1:
            return None
        try:
            # raw_decode stops at the end of the JSON value, so a ";" inside a
            # string cannot truncate the payload the way "=(.*?);" could.
            payload, _ = json.JSONDecoder().raw_decode(text, start)
        except ValueError:
            return None
        return payload if isinstance(payload, dict) else None


    def Crawl(ajax_url):
        """Request one rating URL and print the movie's title, ranking and scores.

        Nothing is printed for a non-200 response. When the response holds no
        usable JSON payload a notice is printed and the function returns,
        instead of raising IndexError/AttributeError as before.

        Args:
            ajax_url: request URL as produced by ``Create_Ajax_URL``.
        """
        r = requests.get(url=ajax_url, headers=headers)
        if r.status_code != 200:
            return
        r.encoding = 'utf-8'

        payload = _extract_ajax_json(r.text)
        value = payload.get('value') if payload else None
        if not isinstance(value, dict):
            print('未获取到数据:%s' % ajax_url)
            return

        # Missing sections fall back to empty dicts so the fields print as None.
        top_list = value.get('topList') or {}
        movieRating = value.get('movieRating') or {}

        movieTitle = value.get('movieTitle')
        TopListName = top_list.get('TopListName')
        Ranking = top_list.get('Ranking')
        RatingFinal = movieRating.get('RatingFinal')
        RDirectorFinal = movieRating.get('RDirectorFinal')
        ROtherFinal = movieRating.get('ROtherFinal')
        RPictureFinal = movieRating.get('RPictureFinal')
        RStoryFinal = movieRating.get('RStoryFinal')

        print(movieTitle)
        box_office = value.get('boxOffice')
        if box_office:
            TotalBoxOffice = box_office.get('TotalBoxOffice')
            TotalBoxOfficeUnit = box_office.get('TotalBoxOfficeUnit')
            print('票房:%s%s' % (TotalBoxOffice, TotalBoxOfficeUnit))
        print('%s——No.%s' % (TopListName, Ranking))
        print('综合评分:%s 导演评分:%s 画面评分:%s 故事评分:%s 音乐评分:%s' % (RatingFinal, RDirectorFinal, RPictureFinal, RStoryFinal, ROtherFinal))
        print('****' * 20)
    59   
    def main():
        """Fetch the Top 100 movie URLs and crawl the rating data of each one in turn."""
        for movie_url in Get_Movie_URL():
            Crawl(Create_Ajax_URL(movie_url))

        # Known issue: requesting a single movie link like the one below
        # intermittently fails to return data.
        # Crawl(Create_Ajax_URL('http://movie.mtime.com/98604/'))

    if __name__ == '__main__':
        main()
    View Code

     相关工具

    链接: https://pan.baidu.com/s/1oEw_MsaAWcMx7NQII6jXYg 密码: e6b6
    
    链接: https://pan.baidu.com/s/1fSppM-hK2x9Jk9RGqvRMqg 密码: 4q43
    
  • 相关阅读:
    使用淘宝Str2varlist与str2numlist 代替 in/exist ,提升性能(Oracle)
    由浅入深理解索引的实现
    你知道数据库索引的工作原理吗?
    深入理解数据库磁盘存储(Disk Storage)
    如何解析oracle执行计划
    Beyond Compare 4 最新中文版 注册码 key
    并发和并行的区别
    代码复用的规则
    Some Java exceptions, messages and errors.
    菜鸟学SSH(十六)——Struts2内部是如何工作的
  • 原文地址:https://www.cnblogs.com/navysummer/p/8808277.html
Copyright © 2011-2022 走看看