下面是源代码。在调试代码的过程中发现,用 MySQL 存储特别慢,最好改用 MongoDB 或者 Redis;后面将会推出协程和线程搭配爬取数据的版本。
# -*- coding: utf-8 -*-
"""Scrape new-house listings from fang.com and store them in MySQL.

Each listing page is fetched with a randomly chosen User-Agent, parsed with
lxml XPath queries, and the extracted (link, name, address, price, phone)
tuples are inserted into the ``ftx`` table. Pages are fetched concurrently
through a ``threadpool.ThreadPool``.
"""

import logging
import os
import random
import re
import threading
import time
import urllib.request
from multiprocessing import Process

import pymongo
import pymysql
import requests
import threadpool
import xlwt
from lxml import etree
from pymongo import MongoClient

import log

# Pool of browser User-Agent strings; one is picked at random per request.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Sample Shenzhen listing URL (page 11); not used by the crawler itself.
url = 'http://newhouse.sz.fang.com/house/s/b911/?ctm=1.sz.xf_search.page.9'

# In-memory Excel workbook; nothing below writes to it or saves it.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Sheet Name")

# Reference snippets left by the author (not executed):
#   sheet.write(0, 2, 'foobar')      -> arguments are row, column, value
#   workbook.save("foobar.xls")
#   re.findall('"((http|ftp)s?://.*?)"', str(html.text))
#                                    -> regex that collects every URL on a page
#   MongoClient('localhost', int(27017))  -> connect to the database

# Listing-page URL templates; ``{page}`` is the page number.
GZ_LISTING_URL = 'http://newhouse.gz.fang.com/house/s/b9{page}/?ctm=1.gz.xf_search.page.6'
SZ_LISTING_URL = 'http://newhouse.sz.fang.com/house/s/b9{page}/?ctm=1.sz.xf_search.page.9'

# Seconds to wait for the server. requests never times out unless a timeout
# is passed, so without this the timeout handler in get_data could not fire.
REQUEST_TIMEOUT = 10

# Connection parameters for the MySQL server that holds the ``ftx`` table.
MYSQL_SETTINGS = {
    'host': '192.168.191.1',
    'user': 'root',
    'passwd': '123456789',
    'db': 'data',
    'port': 3306,
    'charset': 'utf8',
}

# Placeholders are filled in by the driver, which escapes every value.
INSERT_SQL = 'INSERT INTO ftx(link,adr,adress,price,phone) VALUES (%s,%s,%s,%s,%s)'


class Ft(object):
    """Fetch fang.com listing pages and persist the parsed rows."""

    def save_mysql(self, d_t):
        """Insert scraped rows into the ``ftx`` table.

        Args:
            d_t: iterable of 5-item sequences
                ``(link, name, address, price, phone)``; every item is
                converted with ``str`` before being stored.

        All rows are sent over a single connection with one parameterized
        ``executemany`` call and committed once. Nothing is connected or
        written when ``d_t`` is empty. The connection is always closed, and
        an error before the commit leaves the table unchanged.
        """
        rows = [tuple(str(field) for field in row[:5]) for row in d_t]
        if not rows:
            return

        conn = pymysql.connect(**MYSQL_SETTINGS)
        try:
            with conn.cursor() as cur:
                cur.executemany(INSERT_SQL, rows)
            conn.commit()
        finally:
            conn.close()

    def get_data(self, url):
        """Download one Guangzhou listing page, parse it and store the rows.

        Args:
            url: page number substituted into the listing URL (despite the
                parameter name, this is not a full URL).

        Returns early, without saving anything, when the request fails or
        the server answers with a status other than 200.
        """
        page_url = GZ_LISTING_URL.format(page=url)
        headers = {'User-Agent': random.choice(user_agent_list)}

        try:
            html = requests.get(page_url, headers=headers,
                                timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            # NOTE(review): the success path calls ``log.kk`` while this path
            # calls ``log.gg.kk``; confirm both exist in the project's
            # ``log`` module.
            log.gg.kk(e)
            return

        html.encoding = 'gbk'
        if html.status_code != 200:
            print('下载失败!!!')
            return
        log.kk('下载网页数据成功')

        selector = etree.HTML(str(html.text))
        links = selector.xpath('//div[@class="nlc_img"]/a/@href')
        addr = [name.strip() for name in
                selector.xpath('//div[@class="nlcd_name"]/a/text()')]
        addrs = selector.xpath('//div[@class="address"]/a/@title')
        prices = selector.xpath('//div[@class="nhouse_price"]/span/text()')
        tels = selector.xpath('//div[@class="tel"]/p/text()')

        # zip stops at the shortest list, so a listing that lacks one of the
        # fields shortens (and may misalign) the result.
        r = list(zip(links, addr, addrs, prices, tels))
        print(r)
        self.save_mysql(r)

    def save_data(self, get_dat):
        """Placeholder for a MongoDB backend; only creates a client.

        Args:
            get_dat: currently unused.
        """
        client = MongoClient('localhost', 27017)

    def log(self):
        """Configure root logging to ``myapp.log`` and emit three test records.

        The file is opened with mode ``'w'``, so an existing log is
        overwritten.
        """
        logging.basicConfig(
            level=logging.DEBUG,
            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%a, %d %b %Y %H:%M:%S',
            filename='myapp.log',
            filemode='w',
        )
        logging.debug('This is debug message')
        logging.info('This is info message')
        logging.warning('This is warning message')


def main():
    """Crawl listing pages 0 and 1 concurrently through a thread pool."""
    dt = Ft()
    pool = threadpool.ThreadPool(50)
    for req in threadpool.makeRequests(dt.get_data, range(2)):
        pool.putRequest(req)
    pool.wait()


if __name__ == "__main__":
    main()

# Table creation code for the ``ftx`` table (MySQL):
#
# create table ftx(
#     id int not null auto_increment,
#     link varchar(100) not null,
#     adr varchar(100) not null,
#     adress varchar(100) not null,
#     price varchar(100) not null,
#     phone varchar(100) not null,
#     PRIMARY KEY (id )
# );
alter table ftx modify column price varchar(100) character set utf8 not null #修改字段的字符集
SHOW CREATE DATABASE data; -- 查看数据库字符集
show full columns from ftx; -- 查看数据表的字符集
值得注意的是:在插入数据之前,记得把相关字段的字符集改成 utf8,否则会报错;最好在刚开始建表时就指定该表的字符集为 utf8。