# Goal: scrape "street style" images for a keyword from Toutiao (今日头条)
# Approach:
# 1. Analyse the target site
# 2. Construct the ajax request, fetch the index page with requests, and pull the index URLs out with regex + BeautifulSoup
# 3. Request each index URL to get the image URLs and the title, then download the images and save the record to a database (MongoDB here)
# 4. Loop with a process pool to walk through and scrape multiple pages of results

# Question 1: why construct the request?
# For example, the actual URL behind the first screen of results is:
# http://www.toutiao.com/search_content/?offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1
# The long string of parameters at the end is the "settings" of the request: the keyword, how many items to load,
# which page, and so on. They are really a dictionary, and typing them out by hand would be tedious, so we encode
# the dictionary into a query string and attach it to the request URL
# (see the short urlencode illustration after this script).
import os
import json
import re
from json import JSONDecodeError
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import pymongo

from config import *

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_index_page(offset, keyword):
    # Build the query string for the ajax index request and fetch it.
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Failed to request the index page!')
        return None


def parse_index_page(html):
    # json.dumps serialises a Python object to a JSON string;
    # json.loads parses a JSON string back into a Python object.
    if not html:
        return
    try:
        data = json.loads(html)
    except JSONDecodeError:
        return
    if data and 'data' in data.keys():
        for item in data.get('data'):
            yield item.get('article_url')


def get_detail_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def save_to_mongo(result):
    if db[MONGO_TABLE].insert(result):
        print('Saved to MongoDB:', result)
        return True
    return False


def parse_detail_page(html, url):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string
    # The gallery data is embedded in the page as a JavaScript variable.
    pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = re.findall(pattern, html)
    if result:
        images = []
        for i in result:
            i = json.loads(i)
            for k in i.get('sub_images'):
                images.append(k.get('url'))
        return {
            'title': title,
            'url': url,
            'images': images
        }


def download_image(result):
    image_list = result.get('images')
    image_title = result.get('title')
    print('Downloading: %s' % image_title)

    # One sub-directory per gallery, named after its title.
    if image_title not in os.listdir(path='.'):
        os.mkdir(image_title)
    os.chdir(image_title)
    for image in image_list:
        try:
            response = requests.get(image)
            if response.status_code == 200:
                filename = image.split('/')[-1] + '.jpg'
                with open(filename, 'wb') as f:
                    f.write(response.content)
                print('Downloading: %s' % image)
            else:
                return None
        except RequestException:
            return None
    os.chdir(os.pardir)  # go back up to the parent directory


def main(offset):
    html = get_index_page(offset, KEYWORDS)
    for url in parse_index_page(html):
        html = get_detail_page(url)
        if html:
            result = parse_detail_page(html, url)
            if result:
                # save_to_mongo(result)
                download_image(result)


if __name__ == '__main__':
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
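# --- Illustration: what urlencode produces -------------------------------------
# A minimal, self-contained sketch of the query-string construction described in
# the notes at the top of the script above. The keyword '街拍' is only an example
# value (it matches the percent-encoded keyword in the sample URL), not a value
# taken from config.py.
from urllib.parse import urlencode

params = {
    'offset': 20,
    'format': 'json',
    'keyword': '街拍',   # non-ASCII values are percent-encoded: %E8%A1%97%E6%8B%8D
    'autoload': 'true',
    'count': 20,
    'cur_tab': 1,
}
# urlencode joins the dict into key=value pairs separated by '&':
# offset=20&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1
print('http://www.toutiao.com/search_content/?' + urlencode(params))
# --------------------------------------------------------------------------------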
# For comparison: the version written by the course instructor ("老司机")
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *

client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    # Name the file after the MD5 of its content, which also deduplicates images.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    if not text:
        return
    try:
        data = json.loads(text)
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    images_pattern = re.compile('var gallery = (.*?);', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


def save_to_mongo(result):
    if db[MONGO_TABLE].insert(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if result:
                save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
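# --- Hypothetical config.py ------------------------------------------------------
# Both scripts rely on `from config import *`, but config.py itself is not shown
# above. The sketch below is only an assumption of what it might contain, based on
# the names the scripts reference; the values are placeholders, not the author's
# actual settings.
MONGO_URL = 'localhost'    # MongoDB host / connection string
MONGO_DB = 'toutiao'       # database name
MONGO_TABLE = 'toutiao'    # collection written by save_to_mongo()
KEYWORDS = '街拍'          # search keyword used by the first script
KEYWORD = '街拍'           # search keyword used by the instructor's script
GROUP_START = 1            # first result group (offset = group index * 20)
GROUP_END = 20             # last result group to fetch
# ----------------------------------------------------------------------------------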