  • Crawling Fang.com (房天下) listings with multiple threads and storing them in MySQL (with a crawl-log output module)

    Below is the source code. While debugging it I found that writing to MySQL was particularly slow (a sketch of one way to speed up the inserts follows the listing); MongoDB or Redis would be a better fit for storage. A later post will pair coroutines with threads to crawl the data.

# -*- coding: utf-8 -*-

import requests, time, urllib.request, os, re, xlwt
import threading, random, threadpool
import pymongo, pymysql, logging
from multiprocessing import Process
from lxml import etree
from pymongo import MongoClient
import log  # the author's crawl-log helper module (not shown; it exposes kk(), a sketch follows the listing)

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
url = 'http://newhouse.sz.fang.com/house/s/b911/?ctm=1.sz.xf_search.page.9'
workbook = xlwt.Workbook()                # Excel workbook stub (see the commented lines below)
sheet = workbook.add_sheet("Sheet Name")

# sheet.write(0, 2, 'foobar')  # row, column, value
# workbook.save("foobar.xls")
# links = re.findall('"((http|ftp)s?://.*?)"', str(html.text))  # regex that would grab every URL on the page
# client = MongoClient('localhost', int(27017))  # connect to the database

class Ft(object):
    def save_mysql(self, d_t):
        # d_t is a list of (link, adr, adress, price, phone) tuples scraped from one page
        for i in d_t:
            lk = str(i[0])
            ad = str(i[1])
            ade = str(i[2])
            pe = str(i[3])
            phe = str(i[4])
            conn = pymysql.connect(host='192.168.191.1', user='root', passwd='123456789',
                                   db='data', port=3306, charset='utf8')
            cur = conn.cursor()  # get a cursor
            sql = 'INSERT INTO ftx(link, adr, adress, price, phone) VALUES (%s, %s, %s, %s, %s)'
            cur.execute(sql, (lk, ad, ade, pe, phe))  # parameterized insert, one row at a time
            cur.close()   # close the cursor
            conn.commit() # commit the transaction
            conn.close()  # release the connection (opening one per row is why MySQL storage feels slow)

    def get_data(self, url):
        headers = {}
        addr = []
        url_2 = 'http://newhouse.gz.fang.com/house/s/b9' + str(url) + '/?ctm=1.gz.xf_search.page.6'
        url_1 = 'http://newhouse.sz.fang.com/house/s/b9' + str(url) + '/?ctm=1.sz.xf_search.page.9'  # Shenzhen variant, kept but unused
        headers['User-Agent'] = random.choice(user_agent_list)
        try:
            html = requests.get(url_2, headers=headers)
            html.encoding = 'gbk'
            if html.status_code == 200:
                log.kk('page downloaded successfully')
            else:
                print('download failed!!!')
        except requests.exceptions.ReadTimeout as e:
            log.kk(e)
            return  # nothing to parse
        selector = etree.HTML(str(html.text))
        links = selector.xpath('//div[@class="nlc_img"]/a/@href')
        addrnames = selector.xpath('//div[@class="nlcd_name"]/a/text()')
        for i in addrnames:
            addr.append(i.strip())
        addrs = selector.xpath('//div[@class="address"]/a/@title')
        prices = selector.xpath('//div[@class="nhouse_price"]/span/text()')
        tels = selector.xpath('//div[@class="tel"]/p/text()')
        r = list(zip(links, addr, addrs, prices, tels))
        print(r)
        self.save_mysql(r)

    def save_data(self, get_dat):
        client = MongoClient('localhost', int(27017))  # connect to MongoDB; placeholder hook, can be ignored

    def log(self):
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename='myapp.log',
                            filemode='w')
        logging.debug('This is debug message')
        logging.info('This is info message')
        logging.warning('This is warning message')

if __name__ == "__main__":
    dt = Ft()
    gd = dt.get_data
    pool = threadpool.ThreadPool(50)              # 50 worker threads
    reqs = threadpool.makeRequests(gd, range(2))  # one task per page index (0 and 1)
    [pool.putRequest(req) for req in reqs]
    pool.wait()

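The import log at the top refers to the crawl-log module mentioned in the title, which is not included in the post. Here is a minimal sketch of what such a log.py might look like, assuming all the crawler needs is the kk() call used above; the file name, log file name, and everything inside the module are my assumptions, not the author's actual code:

# log.py -- hypothetical crawl-log helper; only kk() is inferred from the usage above
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename='spider.log',   # assumed log file name
                    filemode='a')

def kk(msg):
    """Write a message (or exception) to the crawl log and echo it to the console."""
    logging.info(msg)
    print(msg)
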
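On the MySQL slowness mentioned at the top: much of the cost comes from opening a fresh connection and committing once per row. Below is a minimal sketch of a faster variant that reuses one connection and batches the rows with executemany; it assumes the same ftx table and connection parameters as the listing, and the batching itself is a suggestion rather than part of the original code:

import pymysql

def save_mysql_batch(rows):
    # rows: list of (link, adr, adress, price, phone) tuples from one page
    conn = pymysql.connect(host='192.168.191.1', user='root', passwd='123456789',
                           db='data', port=3306, charset='utf8')
    try:
        with conn.cursor() as cur:
            sql = 'INSERT INTO ftx(link, adr, adress, price, phone) VALUES (%s, %s, %s, %s, %s)'
            cur.executemany(sql, rows)  # one batched round trip instead of one per row
        conn.commit()  # single commit for the whole page
    finally:
        conn.close()

Calling this once per page from get_data, in place of save_mysql, removes the per-row connection overhead while keeping the same table layout.
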
The table-creation SQL is attached below:

create table ftx(
    id int not null auto_increment,
    link varchar(100) not null,
    adr varchar(100) not null,
    adress varchar(100) not null,
    price varchar(100) not null,
    phone varchar(100) not null,
    PRIMARY KEY (id)
);

    alter table ftx modify column price varchar(100) character set utf8 not null;  # change a column's character set

    SHOW CREATE DATABASE data;  # check the database character set

    show full columns from ftx;  # check the character set of each column in the table

    Note: when inserting the data, make sure the relevant columns use the utf8 character set, otherwise the INSERT will fail with an encoding error. It is best to declare utf8 for the table when it is first created, as in the sketch below.
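For reference, here is a sketch of creating the table with the character set declared up front, run through pymysql; the connection parameters are the ones from the listing above, so adjust them as needed:

import pymysql

ddl = '''CREATE TABLE IF NOT EXISTS ftx (
    id INT NOT NULL AUTO_INCREMENT,
    link VARCHAR(100) NOT NULL,
    adr VARCHAR(100) NOT NULL,
    adress VARCHAR(100) NOT NULL,
    price VARCHAR(100) NOT NULL,
    phone VARCHAR(100) NOT NULL,
    PRIMARY KEY (id)
) DEFAULT CHARSET=utf8'''

conn = pymysql.connect(host='192.168.191.1', user='root', passwd='123456789',
                       db='data', port=3306, charset='utf8')
with conn.cursor() as cur:
    cur.execute(ddl)   # create the table with utf8 as its default charset
conn.commit()
conn.close()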
