Multithreaded scraping, done in a second

# -*- coding: utf-8 -*-
"""
Created on Tue May 17 16:26:31 2016
Scraped csv files smaller than 2 KB indicate a failed page
@author: Administrator
"""
import requests,bs4,csv,time,random,os,threading
#output file that will hold all the combined data
fileName='combinedFile.csv'
#second-level URLs that failed to download
bad_urls=[]
site_hubei="http://china.guidechem.com/suppliers/list_catid-21_area-%E6%B9%96%E5%8C%97"
site_guangdong="http://china.guidechem.com/suppliers/list_catid-21_area-广东"
site_shanghai="http://china.guidechem.com/suppliers/list_catid-21_area-%E4%B8%8A%E6%B5%B7"
site_shanxi="http://china.guidechem.com/suppliers/list_catid-21_area-陕西"
pages_hubei=31
pages_guangdong=21
pages_shanghai=34
pages_shanxi=15
start_page=0
def Get_sites(site,pages):
    list_pages=[]
    for page in range(1,pages+1):
        thePage=site+"-"+"p"+str(page)+".html"
        list_pages.append(thePage)
    return list_pages
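# Example of the generated URLs (derived from the code above):
#   Get_sites(site_shanxi, 2)
#   -> ['http://china.guidechem.com/suppliers/list_catid-21_area-陕西-p1.html',
#       'http://china.guidechem.com/suppliers/list_catid-21_area-陕西-p2.html']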
def Get_company_name(elems,i):
    elems_company_name=elems[i].select(".dblue")
    if len(elems_company_name)==0: # element missing: return an empty string
        company_name=""
        return company_name
    company_name=elems_company_name[0].text
    return company_name
def Get_main_product(elems,i):
    elems_main_product=elems[i].select("li")
    if len(elems_main_product)==0: # element missing: return an empty string
        main_product=""
        return main_product
    main_product=elems_main_product[1].text.strip("\n")
    return main_product
def Get_phone_address(elems,i):
    elems_contact=elems[i].select(".site_l")
    content_contact=elems_contact[0].text
    content_contact1=content_contact.strip("\n")
    content_contact2=content_contact1.strip("\n")
    list_content_contact=content_contact2.split("\n")
    # information is sometimes missing; classify a lone line by its label.
    # Default to empty strings so the tuple is always defined.
    phone=""
    address=""
    if len(list_content_contact)==2:
        phone=list_content_contact[0]
        address=list_content_contact[1]
    if len(list_content_contact)==1:
        content=list_content_contact[0]
        if "地址" in content:   # the "address" label
            address=content
            phone=""
        if "电话" in content:   # the "phone" label
            phone=content
            address=""
    phone_address=(phone,address)
    return phone_address
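# Example of the two branches above (hypothetical contact text): a block like
# "电话:027-8765xxxx\n地址:武汉市..." splits into two lines, so phone and
# address are taken positionally; a single surviving line is classified by
# whether it carries the "地址" (address) or "电话" (phone) label.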
#collect the 20 company entries on one page into list_rows_information
def Get_page_information(url):
    list_rows_information=[]
    res=requests.get(url)
    time.sleep(2)
    soup=bs4.BeautifulSoup(res.text,"lxml")
    time.sleep(2)
    # the combined info block for each company
    elems=soup.select(".clist_list_content_r")
    num=len(elems)
    for i in range(num):
        try:
            # company name
            company_name=Get_company_name(elems,i)
            # main products
            main_product=Get_main_product(elems,i)
            # contact details
            phone_address=Get_phone_address(elems,i)
            phone=phone_address[0]
            address=phone_address[1]
            list_rows_information.append([company_name,main_product,phone,address])
        except:
            print("error at:",i)
            continue
    return list_rows_information
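# Hardening note (a suggestion, not in the original): requests.get() above has
# no timeout, so one stalled connection can hang its thread indefinitely;
# requests.get(url, timeout=10) would bound the wait.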
#write one page's rows to a csv file; the rows form a 2-D list [[a],[b],[c]]
def Write_table_to_csv(url):
    list_tableContent=Get_page_information(url)
    # name the file after the page suffix of the url, e.g.
    # os.path.splitext('...-p12.html')[0][-3:]+".csv" -> 'p12.csv'
    fileName=os.path.splitext(url)[0][-3:]+".csv"
    # write rows as lists; writing plain strings produces the wrong format
    file=open(fileName,'w',newline='')
    writer1=csv.writer(file)
    writer1.writerows(list_tableContent)
    file.close()
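# fileName='combinedFile.csv' is declared at the top but never written. A
# minimal merge sketch (an assumption about the intended post-processing step,
# not part of the original flow): concatenate every per-page csv in the
# working directory into combinedFile.csv.
def Combine_csvs(out_name='combinedFile.csv'):
    with open(out_name,'w',newline='') as out_file:
        writer=csv.writer(out_file)
        for name in sorted(os.listdir('.')):
            if name.endswith('.csv') and name!=out_name:
                with open(name,newline='') as part:
                    writer.writerows(csv.reader(part))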
#single-threaded fallback: write every page's csv in sequence
def Write_allTables_to_csvs(list_pages):
    for i in range(start_page,len(list_pages)):
        try:
            Write_table_to_csv(list_pages[i])   # pass the URL, not the index
            time.sleep(random.randint(30,31))
        except:
            print("error at:",i)
            continue
#slice size: spread the page list across 15 threads
def Step(urls_list):
    step=len(urls_list)/15.0
    step=int(round(step,0))
    return step
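# Example: Shaanxi's 15 pages give step = round(15/15.0) = 1 (one page per
# thread); Shanghai's 34 pages would give step = round(34/15.0) = 2.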
#scrape company data for the pages in list_pages[start:end]
def download_range(start,end):
    urls_list_range1=list_pages[start:end]
    for url in urls_list_range1:
        try:
            Write_table_to_csv(url)
        except:
            bad_urls.append(url)
            continue
    #print("well Done")
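# Thread-safety note (an addition, not in the original): bad_urls is appended
# to from several threads at once. CPython's list.append is atomic under the
# GIL, but an explicit lock is the safer pattern if this code grows:
#   bad_urls_lock=threading.Lock()
#   with bad_urls_lock:
#       bad_urls.append(url)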
#main routine
list_pages=Get_sites(site_shanxi,pages_shanxi)
step=Step(list_pages)
#single-threaded alternative: generate all the csv files in sequence
#Write_allTables_to_csvs(list_pages)
downloadThreads = [] # a list of all the Thread objects
for i in range(0, len(list_pages), step): # one thread per slice of step pages
    downloadThread = threading.Thread(target=download_range, args=(i, i+step))
    downloadThreads.append(downloadThread)
    downloadThread.start()
# Wait for all threads to end.
for downloadThread in downloadThreads:
    downloadThread.join()
print('Done.')
'''
scratch tests
#downloadThread = threading.Thread(target=download_range, args=(10, 12))
#downloadThread.start()
downloadThread = threading.Thread(target=download_range, args=(12, 14))
downloadThread.start()
downloadThread = threading.Thread(target=download_range, args=(14, 16))
downloadThread.start()

i=3
res=requests.get(list_pages[i])
soup=bs4.BeautifulSoup(res.text,"lxml")
elems=soup.select(".clist_list_content_r")
# contact details
elems_contact=elems[2].select(".site_l")
content_contact=elems_contact[0].text
content_contact1=content_contact.strip("\n")
content_contact2=content_contact1.strip("\n")
list_content_contact=content_contact2.split("\n")
# information is sometimes missing; classify a lone line by its label
if len(list_content_contact)==2:
    phone=list_content_contact[0]
    address=list_content_contact[1]
if len(list_content_contact)==1:
    content=list_content_contact[0]
    if "地址" in content:
        address=content
        phone=[]
    if "电话" in content:
        phone=content
        address=[]
'''

