  • 食品伙伴网 (foodmate.net) crawler

    A conventional two-stage crawler that downloads PDF files: the overview-page script collects detail-page links into MySQL, and the detail-page script fetches each link and downloads its PDF.

    Gitee repo: https://gitee.com/MarkPolaris/food_partnership_network/tree/master

    Overview page

    import datetime
    import hashlib
    import re

    import pymysql
    import requests


    class GLY(object):
        def __init__(self):
            self.url = 'http://down.foodmate.net/special/standard/8.html'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
            }
            self.host = '127.0.0.1'
            self.db = 'app_mark'
            self.user = 'root'
            self.passwd = '123456'
            self.charset = 'utf8mb4'

        def get_url(self):
            response = requests.get(self.url, headers=self.headers)
            response.encoding = response.apparent_encoding
            html = response.text
            urls = re.findall('<A title=.*?href="(.*?)"', html)
            # Deduplicate the extracted detail-page links
            urls = set(urls)
            for url in urls:
                # MD5 of the link serves as a unique key in MySQL
                hkey = hashlib.md5(url.encode(encoding='utf-8')).hexdigest()
                tag = '0'  # '0' = not yet crawled by the detail-page script
                channel = '食品添加剂标准'
                sitename = '食品伙伴网'
                lasttime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                list_data = [url, hkey, tag, channel, sitename, lasttime]
                self.save_url(list_data)
            print(len(urls))  # number of unique links found

        def save_url(self, list_data):
            con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
            cur = con.cursor()
            sql = 'insert into gly(link, hkey, tag, channel, sitename, lasttime) values (%s, %s, %s, %s, %s, %s)'
            try:
                cur.execute(sql, list_data)
            except Exception as e:
                con.rollback()
                print('error~', e)
            else:
                con.commit()
                print('insert success')
            cur.close()
            con.close()


    if __name__ == '__main__':
        gly = GLY()
        gly.get_url()

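    Both scripts assume a MySQL table named gly in the app_mark database. The original post does not include its DDL, so the sketch below is an assumption: the column names mirror the insert statement above, and the types are guesses sized for this crawler.

        import pymysql

        # Hypothetical DDL for the gly queue table (not from the original post);
        # column types and sizes are assumptions inferred from the insert above.
        DDL = """
        create table if not exists gly (
            id int auto_increment primary key,
            link varchar(512) not null,
            hkey char(32) not null unique,     -- MD5 of link, enforces dedup
            tag char(1) not null default '0',  -- '0' = pending, '1' = crawled
            channel varchar(64),
            sitename varchar(64),
            lasttime datetime
        ) default charset=utf8mb4
        """

        con = pymysql.connect(host='127.0.0.1', db='app_mark', user='root', passwd='123456', charset='utf8mb4')
        with con.cursor() as cur:
            cur.execute(DDL)
        con.commit()
        con.close()

    With a unique index on hkey, re-running the overview script simply fails on duplicate inserts, which the try/except in save_url already tolerates.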
    Detail page

    import datetime
    import re

    import pymysql
    import requests
    from multiprocessing.dummy import Pool as ThreadPool


    class XLY(object):
        def __init__(self):
            self.host = '127.0.0.1'
            self.db = 'app_mark'
            self.user = 'root'
            self.passwd = '123456'
            self.charset = 'utf8mb4'
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
            }
            self.start = datetime.datetime.now()

        def get_urls(self):
            con = pymysql.connect(host=self.host, db=self.db, user=self.user, passwd=self.passwd, charset=self.charset)
            cur = con.cursor()
            sql = 'select link from gly where tag = "0" and sitename = "食品伙伴网"'
            # Mark the selected rows as crawled so the next run skips them
            after_sql = 'update gly set tag = "1" where tag = "0" and sitename = "食品伙伴网"'
            try:
                cur.execute(sql)
                results = cur.fetchall()
                cur.execute(after_sql)
            except Exception as e:
                con.rollback()
                print('error~', e)
                results = None
            else:
                con.commit()
            cur.close()
            con.close()
            return results

        def download(self, url):
            url = url[0]  # each row from fetchall() is a one-element tuple
            response = requests.get(url, headers=self.headers)
            response.encoding = response.apparent_encoding
            html = response.text
            # The PDF download link sits on an <a class="telecom"> element
            down_url = re.findall('<a class="telecom" href="(.*?)">', html, re.S)
            try:
                down_url = down_url[0]
                # stream=True with an explicit chunk size avoids reading the
                # response one byte at a time
                r = requests.get(down_url, headers=self.headers, stream=True)
                file_name = 'D:/1_work/python采集/PDF/' + down_url.split('auth=')[-1] + '.pdf'
                with open(file_name, 'wb') as pdf:
                    for content in r.iter_content(chunk_size=1024):
                        pdf.write(content)
                print(down_url)
            except Exception as e:
                print('error_url:{}; exception: {}'.format(url, e))


    if __name__ == '__main__':
        xly = XLY()
        urls = xly.get_urls()
        if urls:
            # Download with a pool of 20 threads
            pool = ThreadPool(20)
            pool.map(xly.download, urls)
            pool.close()
            pool.join()
        end = datetime.datetime.now()
        print('Elapsed: {}'.format(end - xly.start))
        # Single-threaded variant, handy for debugging:
        # for url in urls:
        #     xly.download(url)
        #     break
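    If a download fails mid-transfer or the link returns an error page, the saved file will not be a valid PDF. A quick sanity pass over the output directory can catch such files. A minimal sketch, assuming the same hardcoded directory as above; the %PDF magic-byte check is a generic heuristic, not part of the original script:

        import os

        # Flag files that do not start with the PDF magic bytes '%PDF';
        # those are likely HTML error pages and can be re-queued.
        pdf_dir = 'D:/1_work/python采集/PDF/'
        for name in os.listdir(pdf_dir):
            path = os.path.join(pdf_dir, name)
            with open(path, 'rb') as f:
                if f.read(4) != b'%PDF':
                    print('not a valid PDF:', path)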
  • Original post: https://www.cnblogs.com/MC-Curry/p/10561068.html