  • Sogou image scraping, mainly targeting gallery-type results

    Sogou gallery scraping covers a wide range of categories, but the amount of data per category is fairly small.

    import os
    import time
    from concurrent.futures import ThreadPoolExecutor
    import requests
    import re
    import json
    from urllib import parse
    from pymongo import MongoClient
    
    
    class Save:
        def __init__(self, host):
            self.client = MongoClient(host=host, port=27017)
            self.db = self.client.ImageSet
    
        def _save_data_mongodb(self, collect_name, data):
            collection = self.db[collect_name]
            # Deduplicate on the gallery id: skip records that are already stored.
            if collection.find_one({"_id": data['id']}):
                return False
            collection.update_one({'_id': data['id']}, {'$set': data}, upsert=True)
            return True
    
    
    class SouHu:
        def __init__(self, category):
            self.category_name = category
            self.category = parse.quote(category)
            self.image_url_temp = "https://pic.sogou.com/pics/imageddetail2013.jsp?k="+self.category+"&tc=&t=&id=0&d={}"
            self.start_url = "https://pic.sogou.com/pics?query="+self.category+"&mode=8&dm=11&leftp=44230502&cwidth=1024&cheight=768&st=0&start={}&reqType=ajax&reqFrom=result&tn=0"
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
            }
    
        def get_title_id_grpdocs(self, url):
            # Fetch one AJAX result page and collect id, title and grpdocs for each gallery.
            print(url)
            response = requests.get(url=url, headers=self.headers)
            json_response = json.loads(response.text)
            json_dict = json_response.get('items')
            image_content = []
            for i in json_dict:
                item={}
                try:
                    id = i['mf_id']
                    title = i['title']
                    grpdocs = i['grpdocs']
                    # grpdocs is a whitespace-separated id list; join it with commas for the d= parameter.
                    rule = re.compile(r'\s+')
                    grpdocs = rule.sub(',', grpdocs)
                    item['id'] = id
                    item['title'] = title
                    item['ImageUrl'] = self.image_url_temp.format(grpdocs)
                    image_content.append(item)
                except Exception as e:
                    print(e)
                    continue
            print(image_content)
            return image_content
    
        def get_save_content(self, image_content):
            # Resolve each gallery's detail endpoint into a flat list of image URLs.
            save_content = []
            for image in image_content:
                item={}
                url = image['ImageUrl']
                response = requests.get(url=url, headers=self.headers)
                json_response = json.loads(response.text)
                image_list = []
                for i in json_response:
                    # Prefer pic_url; fall back to ori_pic_url; skip entries that have neither.
                    try:
                        image_url = i['pic_url']
                    except:
                        try:
                            image_url = i['ori_pic_url']
                        except:
                            continue
                    image_list.append(image_url)
                item['id'] = image['id']
                item['title'] = image['title']
                item['url'] = image_list
                save_content.append(item)
            print(save_content)
            return save_content
    
        def save_(self, save_content):
            upload_time = time.strftime("%Y-%m-%d", time.localtime())
            print("Starting write")
            for i in save_content:
                if len(i['url']) < 3:  # skip galleries with fewer than three images
                    continue
                collect_name = "sogou_images"
                result = Save("localhost")._save_data_mongodb(collect_name, data=i)
                if result:
                    try:
                        # Strip whitespace and non-word characters so the title is a safe directory name.
                        rule = re.compile(r'\s*', re.S)
                        rule2 = re.compile(r'\W*', re.S)
                        title = rule.sub('', i['title'])
                        title = rule2.sub('', title)
                        path = 'D:/Sogou/' + self.category_name + '/' + str(upload_time) + '/' + title
                    except Exception as e:
                        print(e)
                        continue
                    if os.path.exists(path):
                        continue
                    else:
                        os.makedirs(path)
                    try:
                        with open(path + '/content.txt', 'w', encoding='utf8') as fb:
                            fb.write(str([i['title']]))
                        # enumerate() numbers the files; list.index() repeats indices for duplicate URLs.
                        for a, s in enumerate(i['url']):
                            with open(path + '/{}.jpg'.format(a), 'wb') as f:
                                print(s)
                                response = requests.get(url=s)
                                f.write(response.content)
                    except Exception as e:
                        print(e)
                        continue
                    print(title + "   write complete")
                else:
                    continue
    
        def run(self, num):
            # num feeds the start= query parameter directly: one result page per call.
            url = self.start_url.format(num)
            image_content = self.get_title_id_grpdocs(url)
            save_content = self.get_save_content(image_content)
            self.save_(save_content)
    
    
    if __name__ == '__main__':
        category = input("Enter a category name: ")
        with ThreadPoolExecutor(10) as executor:
            sh = SouHu(category)
            for num in range(2400):
                executor.submit(sh.run, num)
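
    The requests above run with no timeout or retry, so a single stalled connection can hang one of the ten worker threads indefinitely. Below is a minimal hardening sketch, assuming the same endpoints and headers; the Retry parameters are illustrative choices, not values from the original script:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry


    def make_session():
        # One shared Session gives connection pooling; the mounted adapter
        # retries transient 5xx responses with backoff (these values are assumptions).
        session = requests.Session()
        retry = Retry(total=3, backoff_factor=0.5,
                      status_forcelist=(500, 502, 503, 504))
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("https://", adapter)
        session.mount("http://", adapter)
        return session


    # Usage: create one session per SouHu instance and replace each
    # requests.get(...) call with session.get(url, headers=self.headers, timeout=10).

    Since a Session reuses TCP connections, this also avoids a fresh handshake for every image downloaded from the same host.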
    

      
