zoukankan      html  css  js  c++  java
  • Python 爬58同城 城市租房信息

    爬取完会自动生成csv电子表格文件,含有房价、押付、链接等信息

    环境
    py2.7
    pip install lxml
    pip install cssselect
     
     1 #coding:utf-8
     2 import csv
     3 import urllib2
     4 import lxml.html
     5 import time
     6 import sys
     7 from lxml.cssselect import CSSSelector
     8 import threading
     # Python 2 only: reload(sys) makes setdefaultencoding() available again so
     # the default str/unicode codec can be switched to UTF-8 for the Chinese
     # text this script prints and writes.
     reload(sys)
     sys.setdefaultencoding('utf8')

     # Ask for the 58.com city sub-domain (e.g. "bj"). The value is used in the
     # index URL and in the output file name, so surrounding whitespace from
     # the prompt is stripped.
     print("请输入要爬取得城市简称例如bj(北京):")
     CITY = raw_input(">>>").strip()
    def download(url, user_agent='Google', num_retries=2):
        """Fetch ``url`` and return the response body, or None on failure.

        Args:
            url: Address to request.
            user_agent: Value sent in the ``User-agent`` request header.
            num_retries: How many more attempts to make when the server
                answers with a 5xx status.

        Returns:
            The raw response body, or None when the request failed and no
            retries are left (or the failure was not a 5xx error).
        """
        headers = {'User-agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        try:
            return urllib2.urlopen(request).read()
        except urllib2.URLError as e:
            # Only HTTP errors carry a ``code``; retry just on server-side (5xx)
            # failures, which may be transient.
            is_server_error = hasattr(e, 'code') and 500 <= e.code < 600
            if num_retries > 0 and is_server_error:
                # Pass user_agent explicitly so the decremented counter lands in
                # num_retries and the retry budget actually shrinks.
                return download(url, user_agent, num_retries - 1)
            return None
    26 
    27 
    # Serialises CSV appends: the script starts one get_data() thread per
    # listing and every thread appends to the same output file.
    _CSV_LOCK = threading.Lock()


    def _iter_house_details(tree):
        """Yield the text of each detail cell, stopping at the first missing one."""
        for row in range(1, 8):
            for col in range(1, 3):
                css = 'div.house-desc-item > ul.f14 > li:nth-child(%s) > span:nth-child(%s)' % (row, col)
                cells = CSSSelector(css)(tree)
                if not cells:
                    return
                yield cells[0].text_content().replace(' ', '')


    def get_data(url):
        """Scrape one listing page and append its data to ``<CITY>_houses.csv``.

        Prints the listing title and its price/payment terms, then writes, for
        every detail cell found on the page, four CSV rows: title (with the
        page URL), price, payment terms and the detail text.

        Args:
            url: Address of the listing detail page.

        Returns:
            None. Pages that cannot be downloaded or that lack the expected
            elements are skipped without raising.
        """
        html_text_detail = download(url)
        if not html_text_detail:
            # download() returns None on failure; nothing to parse.
            return

        tree = lxml.html.fromstring(html_text_detail)
        title_nodes = CSSSelector('div.main-wrap > div.house-title > h1')(tree)
        price_nodes = CSSSelector('div.house-pay-way > span:nth-child(1)')(tree)
        deposit_nodes = CSSSelector('div.house-pay-way > span:nth-child(2)')(tree)

        if not title_nodes:
            return
        title = title_nodes[0].text_content()
        print(title)

        if not (price_nodes and deposit_nodes):
            return
        price = price_nodes[0].text_content()
        deposit = deposit_nodes[0].text_content()
        print('%s|%s' % (price, deposit))

        # The title/price/terms rows are repeated for every detail cell, which
        # keeps the row layout of the output file unchanged.
        rows = []
        for detail in _iter_house_details(tree):
            rows.extend([
                ('标题 : ', title, '#', url),
                ('价格: ', price, '#'),
                ('压付: ', deposit, '#'),
                ('详情: ', detail, '#'),
            ])
        if not rows:
            return

        # Open the file once per listing and hold the lock while writing so
        # rows from concurrently scraped listings do not interleave.
        with _CSV_LOCK:
            with open('%s_houses.csv' % CITY, 'ab+') as csvfile:
                writer = csv.writer(csvfile, lineterminator='\n')
                writer.writerows(rows)
    56 
    def get_url(html):
        """Return the unique listing links found in an index page.

        Args:
            html: Markup of the listing index page; may be None or empty when
                the download failed.

        Returns:
            A list of ``href`` values in page order with duplicates removed.
            Empty when ``html`` is None/empty or no links match.
        """
        if not html:
            return []
        tree = lxml.html.fromstring(html)
        sel = CSSSelector('div.mainbox > div.main > div.content > div.listBox > ul.listUl > li > div.des > h2 > a')
        url_list = []
        seen = set()
        for anchor in sel(tree):
            href = anchor.get('href')
            # Skip anchors without an href and links already collected.
            if href and href not in seen:
                seen.add(href)
                url_list.append(href)
        return url_list
    65 
    66 
    if __name__ == '__main__':
        # Download the rental index page of the chosen city and scrape every
        # listing it links to, one thread per listing.
        url_index = 'http://%s.58.com/chuzu/' % CITY
        html_text_list = download(url_index)
        if not html_text_list:
            # download() returns None on failure; stop with a clear message
            # instead of failing inside the HTML parser.
            sys.exit('Failed to download index page: %s' % url_index)
        url_list = get_url(html_text_list)

        for url_detail in url_list:
            thr = threading.Thread(target=get_data, args=(url_detail,))
            thr.start()

            # Brief pause between thread starts.
            time.sleep(0.001)
    py58.py
    ------ 往事如烟,伴着远去的步伐而愈加朦胧。未来似雾,和着前进的风儿而逐渐清晰!
  • 相关阅读:
    视频流媒体服务器网络硬盘录像机NVR接入/解码/转发能力解析
    流媒体服务器安装失败/程序启动错误等问题解决方案
    监控摄像头如何用作网络直播?
    数据库之单表查询
    数据库之表与表之间的关系
    数据库之完整性约束
    数据库之数据类型
    数据库之增删改查操作
    数据库之基本操作和存储引擎
    数据库之数据库基础及安装
  • 原文地址:https://www.cnblogs.com/cutesnow/p/7161692.html
Copyright © 2011-2022 走看看