Fetching data without logging in

# coding=utf-8
"""
Wrap the crawling task in a class.
Goal: fetch all shows for a given Douban region, ordered by popularity.
Approach:
    analyze the target URL in Chrome dev tools,
    build the URL,
    send the request and get the data,
    save the data,
    repeat the last three steps until the last page.
Note: the URLs used in this code are no longer valid.
"""
import requests
import json


class DoubanSpider:
    def __init__(self):
        self.url_temp_list = [
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                "country": "US"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
                "country": "UK"
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                "country": "CN"
            }
        ]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
            "Referer": "https://m.douban.com/movie/"
        }

    def parse_url(self, url):  # send the request, get the response
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, json_str):  # extract the data
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        total = dict_ret["total"]  # total item count reported by the API; not always accurate
        return content_list, total

    def save_content_list(self, content_list, country):  # save
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["country"] = country
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # newline so each record sits on its own line
        print("saved")

    def run(self):  # main logic
        for url_temp in self.url_temp_list:
            num = 0  # the "start" parameter in the URL, i.e. the offset of the current page
            total = 100  # assume a first page exists until the real total is known
            while num < total + 18:  # strictly less than: equality means the previous request already fetched the last page
                # 1. build the URL for this page
                url = url_temp["url_temp"].format(num)
                # 2. send the request, get the response
                json_str = self.parse_url(url)
                # 3. extract the data
                content_list, total = self.get_content_list(json_str)
                # 4. save each page as it comes in, rather than saving everything at the end,
                #    so a failure halfway through does not lose the pages already fetched
                self.save_content_list(content_list, url_temp["country"])
                # if len(content_list) < 18:  # checking for a short page also works as an end condition
                #     break
                # 5. move on to the next page
                num += 18


if __name__ == '__main__':
    douban_spider = DoubanSpider()
    douban_spider.run()

# coding=utf-8
import requests
from retrying import retry

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
}


@retry(stop_max_attempt_number=3)  # retry up to 3 times before giving up
def _parse_url(url, method, data, proxies):
    print("*" * 20)
    if method == "POST":
        response = requests.post(url, data=data, headers=headers, proxies=proxies)
    else:
        response = requests.get(url, headers=headers, timeout=3, proxies=proxies)
    assert response.status_code == 200  # anything other than 200 triggers a retry
    return response.content.decode()


def parse_url(url, method="GET", data=None, proxies={}):
    """Fetch a URL and return the decoded body, or None if all retries fail."""
    try:
        html_str = _parse_url(url, method, data, proxies)
    except Exception:
        html_str = None
    return html_str


if __name__ == '__main__':
    url = "https://www.baidu.com"  # the scheme is required; "www.baidu.com" alone raises MissingSchema
    print(parse_url(url))
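The later spiders import this module with `from parse_url import parse_url`. The `__main__` demo only exercises a plain GET; a hedged sketch of how the `method`, `data` and `proxies` parameters might be used (the httpbin.org URL and the local proxy address are illustrative placeholders, not part of the original code):

from parse_url import parse_url

# POST a small form through a local HTTP proxy; both the target URL and the
# proxy address are placeholders for whatever you actually want to hit.
html = parse_url(
    "https://httpbin.org/post",
    method="POST",
    data={"kw": "python"},
    proxies={"http": "http://127.0.0.1:8888", "https": "http://127.0.0.1:8888"},
)
print(html)  # None if all three retries failed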

from parse_url import parse_url
from lxml import etree
import json

"""
Crawl any Tieba forum: the title and comment count of each thread in the
thread list, plus the images on every page of each thread's detail pages,
and save everything to a file.
Crawled: 2019/3
"""


class TieBa:
    def __init__(self, name):
        self.name = name
        self.start_url = f"https://tieba.baidu.com/f?kw={name}&ie=utf-8&pn=0"
        self.root_url = "https://tieba.baidu.com"

    def etree_get_content(self, text):
        html = etree.HTML(text)
        li_list = html.xpath("//li[@class=' j_thread_list clearfix']")  # group the content by thread
        data = []
        for i in li_list:
            # print(etree.tostring(i).decode())
            item = {}
            item["title"] = i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/text()")[0] if i.xpath(
                ".//div[@class='threadlist_title pull_left j_th_tit ']/a/text()") else None
            item["comments"] = i.xpath(".//span[@class='threadlist_rep_num center_text']/text()")[0] if i.xpath(
                ".//span[@class='threadlist_rep_num center_text']/text()") else None
            item["href"] = self.root_url + i.xpath(".//div[@class='threadlist_title pull_left j_th_tit ']/a/@href")[
                0] if i.xpath(
                ".//div[@class='threadlist_title pull_left j_th_tit ']/a/@href") else None
            item["imgs"] = self.get_img_list(item["href"], [])
            data.append(item)
        next_url = html.xpath(".//a[text()='下一页>']/@href")[0] if html.xpath(".//a[text()='下一页>']/@href") else None
        return data, ("https:" + next_url if next_url is not None else None)

    def pre_html(self, text):
        """Strip the comment markers Tieba wraps around the thread list so lxml can parse it."""
        text = text.replace('<!--', '')
        return text.replace('--!>', '')

    def get_img_list(self, next_url, container):
        """Recursively crawl every page of a thread's detail pages and collect the posted images."""
        if next_url is None:
            return container
        detail_content = parse_url(next_url)
        if detail_content is None:  # request failed after retries, stop here
            return container
        # extract this page's images, then look for the next page
        html = etree.HTML(detail_content)
        img_list = html.xpath("//img[@class='BDE_Image']/@src")  # list of image src attributes
        container.extend(img_list)
        next_url = html.xpath(".//a[text()='下一页']/@href")  # href of the next page
        if next_url:  # non-empty list
            # The return here is optional: container is a mutable list, so the final
            # `return container` would hand back the same object anyway. Writing it is
            # still recommended, since every level except the last then skips one step.
            return self.get_img_list(self.root_url + next_url[0], container)
        return container

    def save_content_dict(self, data):
        file_path = self.name + ".txt"
        with open(file_path, 'a+', encoding='utf8') as f:
            for dd in data:
                f.write(json.dumps(dd, ensure_ascii=False))
                f.write('\n')

    def run(self):
        # first page
        next_url = self.start_url
        # loop over every page
        while next_url is not None:
            html_str = parse_url(next_url)
            if html_str is None:  # request failed, stop crawling
                break
            # pre-process the response
            html_str = self.pre_html(html_str)
            # extract this page's content and the next page's URL
            data, next_url = self.etree_get_content(html_str)
            print(data)
            self.save_content_dict(data)


if __name__ == '__main__':
    name = input("Enter the Tieba forum name: ").strip()
    ba = TieBa(name)
    ba.run()
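TieBa.get_img_list walks a thread's detail pages by recursion, passing the mutable container down as an accumulator. For very long threads that depth is bounded by Python's default recursion limit (roughly 1000 frames), so an iterative loop may be safer. A sketch of such an alternative, reusing the same XPath expressions and the parse_url helper from above (the function name is illustrative, not from the original code):

from lxml import etree
from parse_url import parse_url


def get_img_list_iterative(root_url, first_url):
    """Collect image URLs from every page of a thread without recursion."""
    container = []
    next_url = first_url
    while next_url is not None:
        detail_content = parse_url(next_url)
        if detail_content is None:  # request failed after retries
            break
        html = etree.HTML(detail_content)
        container.extend(html.xpath("//img[@class='BDE_Image']/@src"))
        hrefs = html.xpath(".//a[text()='下一页']/@href")
        next_url = root_url + hrefs[0] if hrefs else None
    return container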

1 """ 2 抓取https://www.qiushibaike.com所有热门 中数据 3 爬取时间:2019/4 4 """ 5 from parse_url import parse_url 6 from lxml import etree 7 import json 8 9 10 class QiuShi: 11 12 def __init__(self): 13 self.start_url = "https://www.qiushibaike.com/8hr/page/{}/" # 根据规律构建全部url地址 14 self.part_url = "https://www.qiushibaike.com" 15 16 def etree_get_content(self, text): 17 etree_elemnt = etree.HTML(text) 18 # 先分组 19 content_list = etree_elemnt.xpath("//div[@class='recommend-article']/ul/li") 20 data = [] 21 for li in content_list: 22 item = {} 23 try: 24 item['title'] = li.xpath(".//a[@class='recmd-content']/text()")[0] if li.xpath(".//a[@class='recmd-content']/text()") else None 25 item['href'] = self.part_url + li.xpath(".//a[@class='recmd-content']/@href")[0] if li.xpath(".//a[@class='recmd-content']/@href") else None 26 item['laugh_num'] = li.xpath(".//div[@class='recmd-num']/span[1]/text()")[0] if li.xpath(".//div[@class='recmd-num']/span[4]/text()") else None 27 item['comment_num'] = li.xpath(".//div[@class='recmd-num']/span[4]/text()")[0] if li.xpath(".//div[@class='recmd-num']/span[4]/text()") else None 28 except Exception as e: 29 print(e) 30 continue 31 data.append(item) 32 return data 33 34 def save_content_dict(self, data): 35 file_path = "糗事百科热门.txt" 36 with open(file_path, 'a+', encoding='utf8') as f: 37 for dd in data: 38 f.write(json.dumps(dd, ensure_ascii=False)) 39 f.write(' ') 40 41 def run(self): 42 # 构建url地址列表 43 for i in range(1, 14): 44 # 获取每一页目标响应 45 html_str = parse_url(self.start_url.format(i)) 46 # 解析页面 47 data = self.etree_get_content(html_str) 48 # 每一页保存一次 49 self.save_content_dict(data) 50 51 52 if __name__ == '__main__': 53 q = QiuShi() 54 q.run()

1 """ 2 多线程 抓取https://www.qiushibaike.com所有热门 中数据, 3 但是该网站布局已经改版了,部分xpath解析的位置已经不在了 4 爬取时间:2017/10 5 """ 6 import requests 7 from lxml import etree 8 import threading 9 from queue import Queue 10 11 12 class QiubaiSpdier: 13 def __init__(self): 14 self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/" 15 self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"} 16 self.url_queue = Queue() 17 self.html_queue = Queue() 18 self.content_queue = Queue() 19 20 def get_url_list(self): 21 # return [self.url_temp.format(i) for i in range(1,14)] 22 for i in range(1,4): 23 self.url_queue.put(self.url_temp.format(i)) 24 25 def parse_url(self): 26 while True: 27 url = self.url_queue.get() 28 print(url) 29 response = requests.get(url,headers=self.headers) 30 self.html_queue.put(response.content.decode()) 31 self.url_queue.task_done() # 注意必须url的get并处理好url的响应put到对应的队列后,再调用url的task_done使计数减一 32 33 def get_content_list(self): # 提取数据 34 while True: 35 html_str = self.html_queue.get() 36 37 html = etree.HTML(html_str) 38 div_list = html.xpath("//div[@id='content-left']/div") #分组 39 content_list = [] 40 for div in div_list: 41 item= {} 42 item["content"] = div.xpath(".//div[@class='content']/span/text()") 43 item["content"] = [i.replace(" ","") for i in item["content"]] 44 item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class") 45 item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon","") if len(item["author_gender"])>0 else None 46 item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()") 47 item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"])>0 else None 48 item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src") 49 item["content_img"] = "https:"+item["content_img"][0] if len(item["content_img"])>0 else None 50 item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()") 51 item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"])>0 else None 52 content_list.append(item) 53 self.content_queue.put(content_list) 54 self.html_queue.task_done() # 注意task_done放在put后面,确保get的结果处理完并且已经put都对应的队列中 55 56 def save_content_list(self): # 保存 57 while True: 58 content_list = self.content_queue.get() 59 for i in content_list: 60 # print(i) 61 pass 62 self.content_queue.task_done() 63 64 def run(self): #实现主要逻辑 65 thread_list = [] 66 #1.url_list 67 t_url = threading.Thread(target=self.get_url_list) 68 thread_list.append(t_url) 69 #2.遍历,发送请求,获取响应 70 for i in range(20): 71 t_parse = threading.Thread(target=self.parse_url) 72 thread_list.append(t_parse) 73 #3.提取数据 74 for i in range(2): 75 t_html = threading.Thread(target=self.get_content_list) 76 thread_list.append(t_html) 77 #4.保存 78 t_save = threading.Thread(target=self.save_content_list) 79 thread_list.append(t_save) 80 for t in thread_list: 81 t.setDaemon(True) # 把子线程设置为守护线程,主线程结束,子线程结束 82 t.start() 83 84 for q in [self.url_queue,self.html_queue,self.content_queue]: 85 # 调用此方法让主线程阻塞,直到队列中所有的项目均被处理。阻塞将持续到队列中的每个项目均调用q.task_done()方法为止 86 q.join() 87 88 if __name__ == '__main__': 89 qiubai = QiubaiSpdier() 90 qiubai.run() 91 print("主线程结束")

1 """ 2 第一页: 3 Request URL: https://www.douyu.com/directory/all 4 Request Method: GET 5 6 7 第二页: 8 Request URL: https://www.douyu.com/gapi/rkc/directory/0_0/2 9 Request Method: GET 10 11 爬取时间:2019/4 12 """ 13 import json 14 import time 15 from retrying import retry 16 from selenium import webdriver 17 from selenium.webdriver.chrome.options import Options 18 19 chrome_options = Options() 20 chrome_options.add_argument('--headless') # 浏览器不提供可视化页面. linux下如果系统不支持可视化不加这条会启动失败 21 22 class DouYuSpider: 23 def __init__(self): 24 self.start_url = 'https://www.douyu.com/directory/all' 25 self.driver = webdriver.Chrome(chrome_options=chrome_options) 26 27 def save_content_dict(self, data): 28 file_path = 'douyu-room' + ".txt" 29 with open(file_path, 'a+', encoding='utf8') as f: 30 for dd in data: 31 f.write(json.dumps(dd, ensure_ascii=False)) 32 f.write(' ') 33 f.flush() 34 35 @retry(stop_max_attempt_number=3) 36 def get_next_page_click(self): 37 next_page = self.driver.find_elements_by_xpath("//li[@class=' dy-Pagination-next']/span") 38 # 最后一页的 下一页 父元素 class=‘dy-Pagination-disabled dy-Pagination-next’ 表示不可点击了 39 if len(next_page) == 0: 40 return -1 41 else: 42 next_page[0].click() 43 44 def get_single_page(self): 45 # 先分组 46 room_list = self.driver.find_elements_by_xpath( 47 "//div[@class='layout-Module-container layout-Cover ListContent']/ul/li") 48 data = [] 49 for room in room_list: 50 item = {} 51 item['title'] = room.find_element_by_xpath(".//h3[@class='DyListCover-intro']").text 52 item['zone'] = room.find_element_by_xpath(".//span[@class='DyListCover-zone']").text 53 # item['img'] = room.find_element_by_xpath(".//img[@class='DyImg-content is-normal']").get_attribute( 54 # 'src') 55 item['anchor_name'] = room.find_element_by_xpath(".//h2[@class='DyListCover-user']").text 56 data.append(item) 57 return data 58 59 60 def run(self): 61 # 第一页 62 self.driver.get(self.start_url) 63 self.driver.implicitly_wait(12) 64 65 while True: 66 # 获取每一页的页面结构化数据 67 data = self.get_single_page() 68 # 保存数据 69 self.save_content_dict(data) 70 # 查找下一页url,并点击 71 try: 72 ret = self.get_next_page_click() 73 time.sleep(2) # 等待页面加载完全 74 if ret == -1: 75 break 76 except Exception as e: 77 print(e) 78 79 self.driver.quit() 80 81 82 if __name__ == '__main__': 83 douyu = DouYuSpider() 84 douyu.run() 85 86 """ 87 优化建议: 88 1.把每一页的self.driver.page_source 页面字符串传给lxml的etree去处理 89 2.staleness_of 尝试失败 https://www.mlln.cn/2018/05/22/python-selenium如何在点击后等待页面刷新 90 91 """
Auto-login examples

1 """ 2 套路:登录首页的时候,已经给浏览器设置cookies,此时未激活 3 登录成功后返回假的cookies,激活未激活的cookies, 4 5 """ 6 import requests 7 from bs4 import BeautifulSoup 8 9 headers = { 10 "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 11 } 12 13 index = requests.get("https://dig.chouti.com/", headers=headers) 14 cookies = index.cookies.get_dict() 15 16 17 # ===========================点赞================= 18 19 # 1.登录 20 login = requests.post( 21 "https://dig.chouti.com/login", 22 data={ 23 "phone": 8615026809593, 24 "password":'dajiahaa', 25 }, 26 headers=headers, 27 cookies=cookies) 28 29 # 2.点赞 30 dizan = requests.post( 31 url="https://dig.chouti.com/link/vote?linksId=25389911", 32 cookies=cookies, 33 headers=headers) 34 35 print(dizan.text)

1 """ 2 套路: 3 - 带请求头 4 - 带cookie 5 - 请求体中: 6 commit:Sign in 7 utf8:✓ 8 authenticity_token:放在页面隐藏表单中 9 login:asdfasdfasdf 10 password:woshiniba8 11 12 """ 13 import requests 14 from bs4 import BeautifulSoup 15 16 headers = { 17 "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 18 } 19 20 login = requests.get( 21 "https://github.com/login", 22 headers=headers, 23 ) 24 cookies = login.cookies.get_dict() 25 login_par = BeautifulSoup(login.content, 'html.parser') 26 token_input = login_par.find(name='input', attrs={"name": "authenticity_token"}) 27 28 authenticity_token = token_input.attrs.get("value") 29 # 1.登录 30 re_login = requests.post( 31 "https://github.com/session", 32 data={ 33 "commit": "Sign in", 34 "utf8":"✓", 35 "login": "cpcp@163.com", 36 "password": 'cs11187', 37 "authenticity_token": authenticity_token, 38 "webauthn-support": "supported" 39 }, 40 cookies=cookies, 41 headers={ 42 "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36", 43 "Referer": "https://github.com/login" 44 } 45 ) 46 47 print(re_login.text)