""" 使用Requests库完成Post表单操作 """ #_*_codingn:utf8 _*_ import requests from bs4 import BeautifulSoup ''' 设置请求头,让程序发出的请求更像来源于浏览器 ''' headers = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"} if __name__ == "__main__": params ={"username": "anything","password": "password"} session =requests.session() post_obj = session.post("http://pythonscraping.com/pages/cookies/welcome.php", params) s = session.get("http://pythonscraping.com/pages/cookies/profile.php") print(post_obj.text.encode("utf-8")) print(s.text.encode("utf-8")) #session.cookies.get_dict() #获取cooking print(session.cookies.get_dict())
# -*- coding: utf-8 -*-
"""Scrape the Maoyan movie board and save each movie as one JSON line.

Plan: fetch a single page -> regex-parse it -> append JSON to
result.txt -> loop over all 10 pages (optionally with a process pool).

Regex notes:
  .*  is greedy: it matches as much as possible, then backtracks.
  .*? is lazy: it matches as little as possible (minimal match).
  re.S lets '.' also match newlines.
"""
import json
import re
import time
from multiprocessing import Pool

import requests
from requests.exceptions import RequestException

headers = {
    # These headers matter: the site rejects obvious non-browser clients.
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Connection': 'keep-alive',
    'Referer': 'http://maoyan.com/board/6'
}


def get_one_page(url):
    """Return the page HTML for *url*, or None on any failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None  # non-200 status
    except RequestException:
        return None


def parse_one_page(html):
    """Yield one dict per movie entry found in *html* (a generator).

    Bug fix: the index group was '(d+)', which matches a literal run of
    the letter 'd'; it must be r'(\\d+)' to capture the ranking digits.
    The pattern is now a raw string so backslashes survive.
    """
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],  # drop the 3-char '主演：' prefix
            'time': item[4].strip()[5:],   # drop the 5-char '上映时间：' prefix
            'score': item[5] + item[6]     # integer + fraction parts joined
        }


def write_to_file(content):
    """Append *content* to result.txt as one JSON object per line.

    ensure_ascii=False keeps Chinese text readable in the output file.
    Bug fix: records are separated with a newline instead of a space so
    the file is valid JSON-lines.
    """
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    """Fetch one board page at *offset* and persist every parsed movie."""
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:  # network failure: nothing to parse (fixes TypeError)
        return
    for item in parse_one_page(html):
        write_to_file(item)


if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)  # be polite between requests
    # Alternative: fan out with a process pool.
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
# coding=utf-8
"""Scrape Toutiao street-photography search results.

1. Fetch index pages from the search API.
2. Fetch each article's detail page.
3. (Planned) download images / save to a database.
4. Loop, optionally with threads.
"""
import requests
from requests.exceptions import RequestException
from json import loads
from bs4 import BeautifulSoup

user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
headers = {"User-Agent": user_agent}


def get_onepage_index(i, keywords):
    """Return the JSON text of one search-result page, or None on failure.

    i        -- result offset (string or int) passed to the API
    keywords -- search keyword
    """
    data = {
        "offset": i,
        "format": "json",
        "keyword": keywords,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab"
    }
    url = 'https://www.toutiao.com/search_content/?'
    try:
        # Consistency fix: send the same browser-like headers used for
        # detail pages; the index request previously omitted them.
        response = requests.get(url, params=data, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('something is wrong!')
        return None


def parse_onepage_index(html):
    """Yield the article_url of every entry in the API's JSON response."""
    data = loads(html)  # json.loads turns the str payload into a dict
    if data and 'data' in data.keys():
        for item in data.get('data'):
            # dict.get returns None when the key is missing.
            yield item.get('article_url')


def get_page_detail(url):
    """Return the HTML of *url*, or None on failure."""
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('wrong url:', url)
        return None


def parsepage(html):
    """Print the <title> text of a detail page parsed with BeautifulSoup."""
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string
    print(title)


def main():
    for i in range(1, 2):
        offset = str(i * 20)
        html = get_onepage_index(offset, '街拍')
        if html is None:  # network failure: loads(None) would raise
            continue
        # Bug fix: the original called parse_onepage_index(html) once,
        # discarded the generator, then iterated a second call.
        for url in parse_onepage_index(html):
            print(url)
            detailhtml = get_page_detail(url)  # detail page text or None
            if detailhtml is not None:  # idiom fix for '== None: pass / else'
                parsepage(detailhtml)
    # get_page_detail('http://toutiao.com/group/6596305324645286404/')


if __name__ == '__main__':
    main()
如有疑问,请留言。
如觉得有帮助,请点个赞,谢谢!