zoukankan      html  css  js  c++  java
  • Python之爬取网页的一个例子

    import time,random
    import urllib2,urllib,socket,re
    from bs4 import BeautifulSoup
    import cx_Oracle



    # Open the Oracle session and cursor shared by the rest of the script.
    # Connection/cursor failures are deliberately NOT caught: without them
    # nothing below can work, so failing loudly here is the useful outcome.
    conn = cx_Oracle.connect('xxx/xxx')
    cursor = conn.cursor()
    try:
        # Create the target table on first run.
        cursor.execute('create table tb_user(id varchar2(50), name varchar2(50),password varchar(50))')
    except cx_Oracle.DatabaseError:
        # Only database errors are tolerated here -- presumably "table already
        # exists" on a re-run (TODO confirm); anything else now propagates
        # instead of being hidden by a bare ``except``.
        print("wwwwwwww")
    # Module-level placeholders kept from the original script.
    x = 0
    my_dh = 0

    def _fetch(url, timeout=60):
        """Download *url* with a browser-like User-Agent and return the raw body.

        The response object is always closed, even if reading fails.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        }
        req = urllib2.Request(url, headers=headers)
        page = urllib2.urlopen(req, timeout=timeout)
        try:
            return page.read()
        finally:
            page.close()


    def _first_text(soup, selector, default=None):
        """Return the text of the first node matching *selector*, or *default*."""
        nodes = soup.select(selector)
        return nodes[0].text if nodes else default


    def crawl(url):
        """Scrape one listing page and store every listing that has a contact.

        Each ``.des h2 a`` link on the page at *url* is followed. For every
        detail page that has a ``.c_000`` node, the function prints a summary,
        inserts a row into ``tb_user`` through the module-level ``cursor`` and
        commits on ``conn``, then appends the summary to the text file.
        Detail pages without a ``.c_000`` node are skipped.

        Args:
            url: Address of the listing page to scrape.

        Raises:
            urllib2.URLError: if a page cannot be downloaded.
            cx_Oracle.DatabaseError: if the insert or commit fails.
        """
        soup = BeautifulSoup(_fetch(url), 'html.parser')
        links = soup.select(".des h2 a")

        # Raw string: in a normal literal the "\t" of "\text.txt" is a TAB.
        # NOTE(review): 'E:Python' is relative to the current directory on
        # drive E: -- confirm whether 'E:\Python\text.txt' was intended.
        # ``with`` closes (and therefore flushes) the file even on errors.
        with open(r'E:Python\text.txt', 'a') as out:
            for link in links:
                # Random pause of up to 5 seconds between detail requests.
                time.sleep(random.random() * 5)

                detail_url = link.get('href')
                if not detail_url:
                    # Anchor without a target: nothing to follow.
                    continue

                # Same headers and timeout as the listing request, so a
                # stalled detail page cannot hang the crawl forever.
                detail = BeautifulSoup(_fetch(detail_url), 'html.parser')

                contact = _first_text(detail, '.c_000')
                if contact is None:
                    continue

                phone = _first_text(detail, '.phone-num', 'null')
                # Missing title/price no longer aborts the crawl with an
                # IndexError; they are simply left empty in the summary.
                title = _first_text(detail, '.c_333.f20', '')
                price = _first_text(detail, '.c_ff552e', '')
                message = detail_url + ' ' + contact + ' ' + phone + ' ' + title + price

                print(message)
                param = {'id': detail_url, 'n': contact, 'p': phone}
                cursor.execute('insert into tb_user values(:id,:n,:p)', param)
                conn.commit()
                print(param)
                out.write(message.encode('utf-8') + ' ')


    # Crawl listing pages pn2 through pn100, one after another.
    # NOTE(review): pn1 is never requested -- confirm that skipping the
    # first listing page is intended.
    for page in range(2, 101):
        url = 'http://cc.58.com/chuzu/pn{}'.format(page)
        crawl(url)

    能够将部分网页数据提取出来形成TXT文档。导入数据库时是每提取一条信息便导入Oracle数据库。而导入文档时，由于文件写入带有缓冲，则是提取一定数量的数据才会进行一次导入。

    其中还有一些小毛病需要改善。

  • 相关阅读:
    错误 2 error C2059: 语法错误:“::”
    完全卸载session 所需要的函数
    header("Location:http://www.baidu.com");
    php str_pad() 用法
    php str_pad();
    设计模式系列-01-开篇
    博客园样式的设置系列-01-侧边栏和皮肤的设置
    vs20132015UML系列之-类图
    php获取当前时间和转换格式
    saltstack:multi-master configuration
  • 原文地址:https://www.cnblogs.com/cwmizlp/p/7009795.html
Copyright © 2011-2022 走看看