  • Crawler code — keeping a record

    A fairly complete, working crawler: it fetches 58.com rental listings through rotating proxies, loads the records into an Oracle database, removes duplicate rows, exports the table to an Excel sheet, and produces a JSON string of the data.

    # -*- coding:utf-8 -*-

    import requests
    from bs4 import BeautifulSoup
    import socket, random
    from retrying import retry
    import xlwt
    import cx_Oracle
    import json

    try:
        conn = cx_Oracle.connect('xxx/xxx')
        cursor = conn.cursor()
        cursor.execute('create table tb_user(url varchar2(250), name varchar2(250), introduce varchar2(250), address varchar2(250))')
    except cx_Oracle.DatabaseError:
        print "The table already exists, but please continue"

    ippool = ['118.180.49.24:8080',
              '27.184.130.29:8888',
              '113.140.43.136:80',
              '60.169.19.66:9000',
              '60.21.206.165:9999']

    @retry
    def crawl(url):
        # pick a random proxy for the listing page
        rip = random.choice(ippool)
        print rip
        s = requests.session()
        proxies = {
            'http': 'http://' + rip,
            'https': 'http://' + rip,
        }
        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'}
        resp = s.get(url, proxies=proxies, timeout=15, headers=headers)
        resp.encoding = 'utf-8'  # set the encoding on the response, not the session
        soup = BeautifulSoup(resp.text, 'html.parser')
        my_title = soup.select('.des h2 a')  # links to the detail pages

        # raw string: '\x' inside a normal string literal is an invalid escape
        record_file = open(r'F:yjh2\xx.txt', 'a')

        for phone in my_title:
            url2 = phone['href']
            # rotate to another random proxy for the detail page
            rip2 = random.choice(ippool)
            proxies = {
                'http': 'http://' + rip2,
                'https': 'http://' + rip2,
            }
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
            resp2 = s.get(url2, proxies=proxies, headers=headers, timeout=15)
            resp2.encoding = 'utf-8'
            soup2 = BeautifulSoup(resp2.text, 'html.parser')
            my_dh = soup2.select('.phone-num')  # phone number, if shown
            if len(my_dh) > 0:
                my_dh1 = my_dh[0].text
            else:
                my_dh1 = 'null'

            try:
                my_man_list = soup2.select('.c_000')
                my_man = my_man_list[0].text                  # contact name
                my_bt = soup2.select('.c_333.f20')[0].text    # listing title
                my_money = soup2.select('.c_ff552e')[0].text  # price
                my_dq = soup2.select('.f14 span a')[1].text   # district

                if len(my_man_list) > 0:
                    message = url2 + ';' + my_man + ':' + my_dh1 + ';' + my_bt + my_money + ';' + my_dq
                    param = {'id': url2, 'n': my_man + ':' + my_dh1, 'p': my_bt + my_money, 'm': my_dq}
                    print message
                    cursor.execute('insert into tb_user values(:id, :n, :p, :m)', param)
                    conn.commit()
                    # remove duplicate urls, keeping the row with the smallest rowid
                    cursor.execute('delete from tb_user '
                                   'where url in (select url from tb_user group by url having count(url) > 1) '
                                   'and rowid not in (select min(rowid) from tb_user group by url having count(url) > 1)')
                    conn.commit()
                    jsonData = []
                    cursor.execute('select * from tb_user')
                    i = 0
                    wbk = xlwt.Workbook()
                    sheet = wbk.add_sheet('foobar', cell_overwrite_ok=True)
                    for row in cursor:
                        result = {}
                        result['url'] = row[0]
                        result['name'] = row[1]
                        result['jieshao'] = row[2]
                        result['diqu'] = row[3]
                        jsonData.append(result)
                        sheet.write(i, 0, row[0])
                        sheet.write(i, 1, row[1].decode('utf-8'))
                        sheet.write(i, 2, row[2].decode('utf-8'))
                        sheet.write(i, 3, row[3].decode('utf-8'))
                        i = i + 1

                    wbk.save("58.xls")
                    jsondatar = json.dumps(jsonData, ensure_ascii=False, indent=4)
                    # jsonData (a list of dicts) supports list operations; jsondatar is only a string

                    record_file.write(message.encode('utf-8') + ' ')
                else:
                    print 'empty!'
                    continue
            except (IndexError, socket.error):
                print '!'

        record_file.close()



    for page in range(2, 31):  # listing pages pn2 .. pn30
        url = 'http://cc.58.com/chuzu/pn{}'.format(page)
        crawl(url)
    print "Download finished"

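    Two details of the proxy handling are easy to miss: requests routes each scheme through the proxies mapping, and the 'https' key here also points at an http:// proxy URL (the proxy then tunnels HTTPS). Dead entries in ippool only surface as timeouts, so it can help to probe a proxy before trusting it. A minimal sketch, assuming httpbin.org is reachable (the probe URL is an assumption, not from the original):

        import requests

        proxies = {'http': 'http://118.180.49.24:8080',
                   'https': 'http://118.180.49.24:8080'}
        # the echoed origin should be the proxy's IP, not the local one
        r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
        print r.text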

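    A bare @retry retries forever on any exception, so a dead proxy can make the script spin indefinitely. The retrying library also accepts limits; a minimal sketch (the attempt count and wait are illustrative assumptions, not values from the original):

        from retrying import retry

        @retry(stop_max_attempt_number=5, wait_fixed=2000)  # up to 5 attempts, 2 s apart
        def crawl(url):
            pass  # body as in the script above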

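    The duplicate-removal SQL is the heart of the "remove duplicate data" step: it deletes every row whose url occurs more than once, except the copy with the smallest rowid. Since a unique url is its own min(rowid), the same effect can be had with a single subquery; a minimal sketch against the same tb_user table:

        # keep one row per url (the one with the smallest rowid), delete the rest
        cursor.execute('delete from tb_user '
                       'where rowid not in (select min(rowid) from tb_user group by url)')
        conn.commit()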

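    On the jsonData/jsondatar comment in the code: json.dumps returns a plain string, so list operations only work on jsonData before serialization; to manipulate the serialized form again it has to be parsed back with json.loads. A small self-contained illustration (the sample records are made up):

        import json

        jsonData = [{'url': 'http://example.com', 'name': 'n', 'jieshao': 'j', 'diqu': 'd'}]
        jsondatar = json.dumps(jsonData, ensure_ascii=False, indent=4)

        jsonData.append({'url': 'http://example.com/2', 'name': 'n2', 'jieshao': 'j2', 'diqu': 'd2'})  # fine: a list
        parsed = json.loads(jsondatar)  # jsondatar is a str; parse it to get a list back
        print parsed[0]['name']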
  • Original post: https://www.cnblogs.com/cwmizlp/p/7116996.html