Google 图片抓取
Google 图片使用的是 base64 编码(并非加密),编码后的数据放在页面的 script 标签里。
import base64
import logging
import os
import re
import ssl
import threading
import time
from threading import RLock

import pymysql
import requests
from lxml import etree

# Guards the single pymysql connection/cursor, which is shared by the main
# thread and by every worker thread started in parse_page().
lock = RLock()

# NOTE(review): this switches off HTTPS certificate verification for the
# standard library's default SSL context, process-wide. It is kept because it
# is an existing module-level side effect; it looks unnecessary for the
# `requests` call below - confirm before removing.
ssl._create_default_https_context = ssl._create_unverified_context

# Logging setup.
logging.basicConfig(
    level=logging.INFO,  # records at this level or above go to the log file
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # record layout
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp layout
    filename='drugimagesError.log',  # log file name
    filemode='a')  # file mode, "w" or "a"

_SEARCH_URL = 'https://www.google.com/search'
_REQUEST_TIMEOUT_SECONDS = 30
_REQUEST_INTERVAL_SECONDS = 3
_MAX_IMAGES_PER_DRUG = 30
_DATA_URI_PREFIX = 'data:image/jpeg;base64,'

# Inline JPEG data URIs inside the result page's script text.
# NOTE(review): `.*` is greedy and stops only at a newline, so this assumes
# each data URI sits on its own line of the script - confirm against a live
# page before relying on it.
_INLINE_JPEG_RE = re.compile(
    r'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD.*"]?')

_SELECT_BATCH_SQL = (
    "select id, me_pizhunwenhao, me_name, me_jixing, me_key "
    "from guo_cai_jin_kou_yao_pin where id > %s order by id limit 1000")
_SELECT_CRAWLED_SQL = "select id from drugimages where approvalNumber = %s"
_INSERT_IMAGE_SQL = (
    "insert into drugimages(approvalNumber, drugName, specifications, image, "
    "num, durgid) values(%s, %s, %s, %s, %s, %s)")


class google_images(object):
    """Crawl inline base64 JPEG images from Google image search per drug.

    Instantiating the class connects to the local MySQL database and starts
    crawling immediately. The constructor does not return normally: it ends
    by raising ``SystemExit`` once the source table has no more rows or the
    record id reaches ``end_record``.
    """

    def __init__(self):
        # Resume cursor: only rows with id > strat_record are read. The
        # misspelled name is kept because it is a public attribute.
        self.strat_record = 1
        # Crawling stops when a row with id >= end_record is reached.
        self.end_record = 10000001
        self.db = pymysql.connect(host='localhost', port=3306,
                                  database='yao_zhi', user='root',
                                  password='root', charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
        self._run()

    def _run(self):
        """Process batches until parse_page() raises, then close the DB."""
        try:
            while True:
                self.parse_page()
        finally:
            # parse_page() joins its workers before it returns or raises, so
            # no worker thread is still using the connection here.
            self.db.close()

    @staticmethod
    def _first_specification(me_key):
        """Return ``me_key`` up to and including its first comma.

        The value is returned unchanged when it contains no such match or is
        not a string (for example ``None``).
        """
        try:
            return re.findall(r".+?,", me_key)[0]
        except (IndexError, TypeError):
            return me_key

    def parse_page(self):
        """Read the next batch of source rows and crawl the uncrawled ones.

        Each row whose approval number has no entry in ``drugimages`` yet is
        handed to ``parse_page_data`` on its own thread. All threads started
        by this call are joined before it returns or raises.

        Raises:
            SystemExit: when the batch is empty or ``end_record`` is reached.
        """
        with lock:
            # `order by id` keeps the resume cursor (last id seen) meaningful.
            row_count = self.cursor.execute(_SELECT_BATCH_SQL,
                                            (self.strat_record,))
            rows = self.cursor.fetchall()
        if not row_count:
            raise SystemExit

        workers = []
        reached_end = False
        try:
            for record_id, approvalNumber, drugName, dosageForm, me_key in rows:
                specifications = self._first_specification(me_key)
                self.strat_record = record_id
                logging.info(
                    "id:%s approvalNumber:%s drugName:%s dosageForm:%s specifications:%s",
                    record_id, approvalNumber, drugName, dosageForm,
                    specifications)
                print("id:%s approvalNumber:%s drugName:%s dosageForm:%s specifications:%s" % (
                    record_id, approvalNumber, drugName, dosageForm,
                    specifications))
                # `>=` rather than equality so that a missing id (a gap in
                # the table) cannot make the crawler run past the end marker.
                if int(record_id) >= self.end_record:
                    reached_end = True
                    break
                with lock:
                    already_crawled = self.cursor.execute(
                        _SELECT_CRAWLED_SQL, (approvalNumber,))
                if already_crawled:
                    continue
                worker = threading.Thread(
                    target=self.parse_page_data,
                    args=(record_id, approvalNumber, drugName, dosageForm,
                          specifications))
                worker.start()
                workers.append(worker)
                # Throttle: pause after every search that is started.
                time.sleep(_REQUEST_INTERVAL_SECONDS)
        finally:
            for worker in workers:
                worker.join()
        if reached_end:
            raise SystemExit

    def parse_page_data(self, id, approvalNumber, drugName, dosageForm, specifications):
        """Search for one drug, then store and save up to 30 inline images.

        For every matched image a row is inserted into ``drugimages`` (the
        stored ``image`` value keeps its ``data:image/jpeg;base64,`` prefix)
        and the decoded bytes are written to
        ``./images/page<id // 1000>/id<id>/<approvalNumber>-<num>.jpg``.
        Failures are logged and end the work for this drug (or skip the one
        image when only its payload is undecodable).

        Args:
            id: primary key of the source row; used for ``durgid`` and paths.
            approvalNumber: approval number of the drug.
            drugName: drug name, first part of the search keyword.
            dosageForm: dosage form, second part of the search keyword.
            specifications: specification text, last part of the keyword.
        """
        print("id:%s approvalNumber:%s drugName:%s specifications:%s" % (
            id, approvalNumber, drugName, specifications))
        # Empty/None parts are dropped instead of crashing the concatenation.
        keyword = ' '.join(
            str(part) for part in (drugName, dosageForm, specifications)
            if part)
        query = [('biw', 1920), ('bih', 900), ('tbm', 'isch'), ('q', keyword)]
        try:
            # `params` URL-encodes the keyword; the timeout keeps a stalled
            # connection from blocking the batch join forever.
            response = requests.get(_SEARCH_URL, params=query,
                                    headers=self.headers,
                                    timeout=_REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException:
            logging.exception("id:%s search request failed, keyword:%s",
                              id, keyword)
            return
        print(response.url)

        page = etree.HTML(response.content)
        scripts = []
        if page is not None:
            scripts = page.xpath('//span[@id="xjs"]/script/text()')
        if not scripts:
            logging.error("id:%s no image script found, keyword:%s",
                          id, keyword)
            return
        image_links = _INLINE_JPEG_RE.findall(scripts[0])

        target_dir = os.path.join('./images',
                                  'page' + str(int(id) // 1000),
                                  'id' + str(id))
        for num, link in enumerate(image_links[:_MAX_IMAGES_PER_DRUG], start=1):
            try:
                # The script text carries backslash escapes; decode them to
                # recover the real data URI, then decode the payload.
                image = link.replace('"]', '').encode('utf-8').decode('unicode_escape')
                image_data = image.replace(_DATA_URI_PREFIX, '')
                pic_content = base64.b64decode(image_data)
            except ValueError:
                # Covers bad escapes and bad base64; validated before the
                # insert so no row is stored without a matching file.
                logging.exception("id:%s image %s could not be decoded",
                                  id, num)
                continue

            # The base64 payload is left out of the debug print on purpose.
            print('sql_data:%s %r' % (
                _INSERT_IMAGE_SQL,
                (approvalNumber, drugName, specifications, num, int(id))))
            logging.info("id:%s approvalNumber:%s drugName:%s specifications:%s",
                         id, approvalNumber, drugName, specifications)
            try:
                with lock:
                    self.cursor.execute(
                        _INSERT_IMAGE_SQL,
                        (approvalNumber, drugName, specifications, image,
                         num, int(id)))
                    self.db.commit()
            except pymysql.MySQLError:
                logging.exception("id:%s insert of image %s failed", id, num)
                return

            # exist_ok avoids the check-then-create race between workers that
            # share the same page directory.
            os.makedirs(target_dir, exist_ok=True)
            file_name = '%s-%s.jpg' % (approvalNumber, num)
            with open(os.path.join(target_dir, file_name), 'wb') as image_file:
                image_file.write(pic_content)


if __name__ == '__main__':
    google_images()