zoukankan      html  css  js  c++  java
  • 多线程爬取房天下数据,并且存储到mysql(增加爬取日志输出模块)

    下面是源代码。在调试代码的过程中,发现用mysql存储特别慢,最好改用mongodb或者redis;后面将会推出协程和线程搭配爬取数据的版本。

      1 # -*- coding: utf-8 -*-
      2 
      3 import requests,time,urllib.request,os,re,xlwt
      4 import threading,random,threadpool
      5 import pymongo,pymysql,logging
      6 from multiprocessing import Process
      7 from lxml import etree
      8 from pymongo import MongoClient
      9 import log
     10  
     11 user_agent_list = [ 
     12           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" ,
     13           "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 
     14           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 
     15           "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 
     16           "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 
     17           "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 
     18           "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 
     19           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
     20           "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
     21           "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 
     22           "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 
     23           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 
     24           "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
     25           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
     26           "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 
     27           "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 
     28           "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 
     29           "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
     30 
     31       ]
     32 url = 'http://newhouse.sz.fang.com/house/s/b911/?ctm=1.sz.xf_search.page.9'
     33 workbook = xlwt.Workbook()
     34 sheet = workbook.add_sheet("Sheet Name")
     35 
     36 #sheet.write(0, 2, 'foobar')# row, column, value
     37 
     38 #workbook.save("foobar.xls")
     39 
     40 #links = re.findall('"((http|ftp)s?://.*?)"', str(html.text))#获取网站所有url的正则表达式
     41 
     42 #client = MongoClient('localhost',int(27017))#链接数据库
     43 
     44 class Ft(object):
     45     def save_mysql(self,d_t):
     46         for i in d_t:
     47             for ii in i:
     48                     lk = str(i[0])
     49                     ad = str(i[1])
     50                     ade = str(i[2])
     51                     pe = str(i[3])
     52                     phe = str(i[4])
     53                     conn = pymysql.connect(host='192.168.191.1', user='root', passwd='123456789', db='data', port=3306,
     54                                            charset='utf8')
     55                     cur = conn.cursor()  # 获取一个游标
     56                     sql = '''INSERT INTO ftx(link,adr,adress,price,phone)VALUES("%s","%s","%s","%s","%s")''' %(lk, ad, ade, pe, phe)
     57                     cur.execute(sql)
     58                     data = cur.fetchall()
     59                     cur.close()  # 关闭游标
     60                     conn.commit()  # 事务提交
     61                     conn.close()  # 释放数据库资源
     62 
     63     def get_data(self,url):
     64         headers={}
     65         addr = []
     66         url_2 = 'http://newhouse.gz.fang.com/house/s/b9'+ str(url) + '/?ctm=1.gz.xf_search.page.6'
     67         url_1 = 'http://newhouse.sz.fang.com/house/s/b9'+ str(url) + '/?ctm=1.sz.xf_search.page.9'
     68         headers['User-Agent'] = random.choice(user_agent_list)
     69         try:
     70             html = requests.get(url_2, headers=headers)
     71             html.encoding = 'gbk'
     72             if html.status_code == 200:
     73                 log.kk('下载网页数据成功')
     74             else:
     75                 print('下载失败!!!')
     76         except requests.exceptions.ReadTimeout as e:
     77             log.gg.kk(e)
     78         selector = etree.HTML(str(html.text))
     79         links = selector.xpath('//div[@class="nlc_img"]/a/@href')
     80         addrnames = selector.xpath('//div[@class="nlcd_name"]/a/text()')
     81         for i in addrnames:
     82             addr.append(i.strip())
     83         addrs = selector.xpath('//div[@class="address"]/a/@title')
     84         prices = selector.xpath('//div[@class="nhouse_price"]/span/text()')
     85         tels = selector.xpath('//div[@class="tel"]/p/text()')
     86         r = list(zip(links, addr, addrs, prices, tels))
     87         print(r)
     88         self.save_mysql(r)
     89 
     90     def save_data(self,get_dat):
     91         client = MongoClient('localhost', int(27017))  # 链接mongodb数据库,预留的接口可忽略·
     92 
     93     def log(self):
     94         logging.basicConfig(level=logging.DEBUG,
     95                             format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
     96                             datefmt='%a, %d %b %Y %H:%M:%S',
     97                             filename='myapp.log',
     98                             filemode='w'
     99         logging.debug('This is debug message')
    100         logging.info('This is info message')
    101         logging.warning('This is warning message')
    102 
    103 if __name__=="__main__":
    104     dt = Ft()
    105     gd = dt.get_data
    106     pool = threadpool.ThreadPool(50)
    107     reqs = threadpool.makeRequests(gd,range(2))
    108     [pool.putRequest(req) for req in reqs]
    109     pool.wait()
    110 
    下面附上建表代码:
    112 
    113 create table ftx(
    114 id int not null auto_increment,
    115 link varchar(100) not null,
    116 adr varchar(100) not null,
    117 adress varchar(100) not null,
    118 price varchar(100) not null,
    119 phone varchar(100) not null,
    120  PRIMARY KEY (id )
    121 );

    alter table ftx modify column price varchar(100) character set utf8 not null; # 修改字段的字符集

    SHOW CREATE DATABASE data; # 查看数据库字符集

    show full columns from ftx; # 查看数据表各字段的字符集

    值得注意的是:在插入数据之前,要先把相关字段的字符集改成utf8,否则插入中文时会报错;最好在建表时就把该表的字符集指定为utf8。

  • 相关阅读:
    每日日报
    每日日报
    java笔记
    每日日报
    每日日报
    每日日报
    查看当前mysql时区 并设置为北京时间
    springboot 指定配置文件启动, 区分开发和线上分支
    Js Contains方法
    vue $refs的基本用法
  • 原文地址:https://www.cnblogs.com/Huangsh2017Come-on/p/7467866.html
Copyright © 2011-2022 走看看