import time
import random
import urllib2

from bs4 import BeautifulSoup
import cx_Oracle

conn = cx_Oracle.connect('xxx/xxx')
cursor = conn.cursor()
try:
    # Create the target table; this raises if it already exists.
    cursor.execute('create table tb_user(id varchar2(50), name varchar2(50), password varchar2(50))')
except cx_Oracle.DatabaseError:
    print 'table tb_user already exists, reusing it'
def crawl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    req = urllib2.Request(url, headers=headers)
    page = urllib2.urlopen(req, timeout=60)
    contents = page.read()
    soup = BeautifulSoup(contents, 'html.parser')
    # Links to the individual listing pages.
    my_title = soup.select('.des h2 a')
    # '\t' in a non-raw string is a tab character, so the path must
    # escape its backslash.
    out_file = open('E:\\Python\\text.txt', 'a')
    for phone in my_title:
        # Random delay to throttle requests.
        time.sleep(random.random() * 5)
        url2 = phone['href']
        html = urllib2.urlopen(url2, timeout=60).read()
        soup2 = BeautifulSoup(html, 'html.parser')
        my_dh = soup2.select('.phone-num')
        my_dh1 = my_dh[0].text if my_dh else 'null'
        my_man = soup2.select('.c_000')
        if not my_man:
            continue
        my_man1 = my_man[0].text
        my_bt = soup2.select('.c_333.f20')[0].text
        my_money = soup2.select('.c_ff552e')[0].text
        message = url2 + ' ' + my_man1 + ' ' + my_dh1 + ' ' + my_bt + my_money
        print message
        param = {'id': url2, 'n': my_man1, 'p': my_dh1}
        # One insert and one commit per scraped record.
        cursor.execute('insert into tb_user values(:id, :n, :p)', param)
        conn.commit()
        print param
        # File writes are buffered, so the TXT file is only updated
        # once enough records have accumulated.
        out_file.write(message.encode('utf-8') + ' ')
    out_file.close()
# Crawl listing pages pn1 through pn99.
for page in range(1, 100):
    url = 'http://cc.58.com/chuzu/pn{}'.format(page)
    crawl(url)
The script extracts part of the page data into a TXT file. When loading into the database, each record is inserted into Oracle as soon as it is extracted; when writing to the text file, the output is buffered, so a batch of records accumulates before a write actually reaches disk.
There are still a few small issues left to fix.
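If the opposite trade-off is wanted, a minimal sketch along these lines would batch the Oracle inserts into one round trip with executemany() and flush the TXT file after every line so it updates immediately. The records list below is hypothetical sample data, not output of the crawler above.

# Sketch: batch the database inserts, flush the text file per record.
import cx_Oracle

conn = cx_Oracle.connect('xxx/xxx')
cursor = conn.cursor()

# Hypothetical rows in (id, name, phone) order, matching tb_user.
records = [
    ('http://cc.58.com/chuzu/1.shtml', 'landlord A', '13800000000'),
    ('http://cc.58.com/chuzu/2.shtml', 'landlord B', 'null'),
]

# One executemany() call and one commit for the whole batch,
# instead of one execute()/commit() per record.
cursor.executemany('insert into tb_user values(:1, :2, :3)', records)
conn.commit()

out_file = open('E:\\Python\\text.txt', 'a')
for rec in records:
    line = u' '.join(rec) + u'\n'
    out_file.write(line.encode('utf-8'))
    out_file.flush()  # push the line out of the buffer immediately
out_file.close()

Batching the inserts cuts the number of commits from one per record to one per batch; the flush() call makes the file behave the way the database insert did in the original script.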