zoukankan      html  css  js  c++  java
  • google image

    google图片抓取

    Google 图片搜索结果中的缩略图是以 base64 编码(并非加密)的形式内嵌在页面里的,编码后的数据位于 script 标签的文本内容中。

    import pymysql
    from lxml import etree
    import logging
    import requests
    import time
    import threading
    from threading import RLock
    import re
    import os
    
    lock = RLock()
    import base64
    import ssl
    
    # Replace the ssl module's default HTTPS context factory with the unverified
    # one, so stdlib HTTPS clients that rely on this hook skip certificate checks.
    # NOTE(review): `requests` is not expected to consult this private hook --
    # confirm whether this line is still needed.
    ssl._create_default_https_context = ssl._create_unverified_context
    
    # Configure logging.
    
    logging.basicConfig(
        level=logging.INFO,  # minimum level written to the log file; records at or above it are emitted
        format='%(asctime)s  %(filename)s  %(levelname)s : %(message)s',  # layout of each log record
        datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
        filename='drugimagesError.log',  # log file name
        filemode='a')  # file mode: "w" (truncate) or "a" (append)
    
    
    class google_images(object):
        """Crawl Google Images results for drug records and store the images.

        Rows are read from ``guo_cai_jin_kou_yao_pin`` in pages of up to 1000.
        For every row whose approval number has no entry in ``drugimages`` yet,
        a worker thread queries Google Images and stores up to 30 inline
        (base64) images, both as rows in ``drugimages`` and as files under
        ``./images/page<N>/id<ID>/``.

        One connection/cursor pair is shared by all threads, so every cursor
        access is serialised through the module-level ``lock``.
        """

        # Upper bound on images stored per drug record.
        _MAX_IMAGES_PER_DRUG = 30
        # Seconds to wait for Google before giving up on one search request.
        _REQUEST_TIMEOUT = 30

        # All statements are parameterized: values never get spliced into SQL.
        # "order by id" makes the "id > last seen id" paging deterministic.
        _SELECT_DRUGS_SQL = (
            "select id, me_pizhunwenhao, me_name, me_jixing, me_key "
            "from guo_cai_jin_kou_yao_pin where id > %s order by id limit 1000")
        _SELECT_IMAGE_SQL = "select id from drugimages where approvalNumber = %s"
        _INSERT_IMAGE_SQL = (
            "insert into drugimages(approvalNumber, drugName, specifications, image, num, durgid) "
            "values(%s, %s, %s, %s, %s, %s)")

        def __init__(self):
            """Connect to MySQL and crawl page after page until the process exits.

            The constructor never returns normally: it calls ``parse_page`` in an
            endless loop, which ends the process by raising ``SystemExit``.
            """
            # Exclusive lower bound of the next page (the query uses "id > ...").
            self.strat_record = 1
            # The crawl stops as soon as a row with this id is read.
            self.end_record = 10000001
            self.db = pymysql.connect(host='localhost', port=3306, database='yao_zhi', user='root', password='root',
                                      charset='utf8')
            self.cursor = self.db.cursor()
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}

            while True:
                self.parse_page()

        @staticmethod
        def _first_specification(raw):
            """Return the first comma-terminated chunk of *raw*, or *raw* itself.

            The returned chunk keeps its trailing comma. When *raw* contains no
            such chunk, or is not a string (e.g. ``None``), it is returned
            unchanged.
            """
            try:
                return re.findall(r".+?,", raw)[0]
            except (TypeError, IndexError):
                # TypeError: raw is not a string; IndexError: no comma found.
                return raw

        def parse_page(self):
            """Process the next page of drug rows, one worker thread per new drug.

            Updates ``self.strat_record`` to the id of each row read and waits
            for all workers of the page to finish before returning.

            Raises:
                SystemExit: when no rows are left, or when the row whose id
                    equals ``self.end_record`` is reached.
            """
            # Execute and fetch under one lock hold so no other thread can reuse
            # the shared cursor between the two calls; the lock is released even
            # if the query raises.
            with lock:
                row_count = self.cursor.execute(self._SELECT_DRUGS_SQL, (self.strat_record,))
                data_tuple = self.cursor.fetchall()
            if not row_count:
                raise SystemExit

            threading_list = []
            for data_one in data_tuple:
                record_id, approvalNumber, drugName, dosageForm = data_one[:4]
                specifications = self._first_specification(data_one[4])
                self.strat_record = record_id

                summary = "id:%s  approvalNumber:%s   drugName:%s   dosageForm:%s   specifications:%s" % (
                    record_id, approvalNumber, drugName, dosageForm, specifications)
                logging.info(summary)
                print(summary)

                if str(record_id) == str(self.end_record):
                    raise SystemExit

                with lock:
                    already_stored = self.cursor.execute(self._SELECT_IMAGE_SQL, (approvalNumber,))
                if already_stored:
                    continue

                worker = threading.Thread(target=self.parse_page_data,
                                          args=(record_id, approvalNumber, drugName, dosageForm, specifications,))
                worker.start()
                threading_list.append(worker)
                # Space the searches out so requests are not fired in a burst.
                time.sleep(3)

            for worker in threading_list:
                worker.join()

        def parse_page_data(self, id, approvalNumber, drugName, dosageForm, specifications):
            """Search Google Images for one drug and store the inline images found.

            Args:
                id: primary key of the drug row; used for the ``durgid`` column
                    and for the output directory name.
                approvalNumber: approval number; stored and used in file names.
                drugName: drug name, first part of the search keyword.
                dosageForm: dosage form, second part of the search keyword.
                specifications: specification text, last part of the keyword.

            For each of the first 30 matches a row is inserted into
            ``drugimages`` and the decoded bytes are written to
            ``./images/page<id // 1000>/id<id>/<approvalNumber>-<n>.jpg``.
            Request failures and pages without the expected script node are
            logged and skipped instead of killing the worker with a traceback.
            """
            print("id:%s  approvalNumber:%s   drugName:%s   specifications:%s" % (
                id, approvalNumber, drugName, specifications))
            keyword = drugName + ' ' + dosageForm + ' ' + specifications
            url = 'https://www.google.com/search?biw=1920&bih=900&tbm=isch&q=%s' % keyword
            print(url)

            try:
                # Without a timeout a stalled connection would block this worker,
                # and therefore parse_page's join(), forever.
                response = requests.get(url=url, headers=self.headers, timeout=self._REQUEST_TIMEOUT)
            except requests.RequestException:
                logging.exception("id:%s  request failed: %s", id, url)
                return

            data_particular = etree.HTML(response.content)
            scripts = data_particular.xpath('//span[@id="xjs"]/script/text()') if data_particular is not None else []
            if not scripts:
                logging.warning("id:%s  no image script found: %s", id, url)
                return
            images_list_link = re.findall(r'.*"]?', scripts[0])

            page_id = int(id) // 1000
            target_dir = './images/' + 'page' + str(page_id) + '/' + 'id' + str(id) + '/'

            # Slicing caps the work at the first _MAX_IMAGES_PER_DRUG matches.
            for num, link in enumerate(images_list_link[:self._MAX_IMAGES_PER_DRUG], 1):
                url_link = link.replace('"]', '')
                # Turn the escape sequences embedded in the script text back into characters.
                image = url_link.encode('utf-8').decode('unicode_escape')
                image_data = image.replace('data:image/jpeg;base64,', '')

                # The database keeps the full data URI; only the file gets the decoded bytes.
                params = (approvalNumber, drugName, specifications, image, num, int(id))
                # Debug rendering only; the statement actually sent is parameterized.
                print('sql_data:%s' % (self._INSERT_IMAGE_SQL % tuple(repr(p) for p in params)))
                logging.info("id:%s  approvalNumber:%s   drugName:%s   specifications:%s" % (
                    id, approvalNumber, drugName, specifications))
                with lock:
                    self.cursor.execute(self._INSERT_IMAGE_SQL, params)
                    self.db.commit()

                pic_content = base64.b64decode(image_data)
                # exist_ok avoids the check-then-create race between worker threads.
                os.makedirs(target_dir, exist_ok=True)
                with open(target_dir + str(approvalNumber) + '-' + str(num) + '.jpg', 'wb') as out_file:
                    out_file.write(pic_content)
    
    
    # Script entry point: constructing google_images starts the crawl, because
    # its __init__ calls parse_page() in an endless loop.
    if __name__ == '__main__':
        google_images()
    

      

  • 相关阅读:
    robotium问答
    android Instrumentoation 问答
    Zookeeper 简介
    JVM 监控工具——jstatd
    JVM 监控工具——jstack
    JVM 监控工具——jps
    TCP 连接状态
    Zookeeper 安装及命令行操作
    MySQL 修改密码和设置远程连接
    Linux 相关系统日志查看
  • 原文地址:https://www.cnblogs.com/yoyo1216/p/10144493.html
Copyright © 2011-2022 走看看