Python code to crawl 36大数据 (36dsj.com) articles
# -*- coding:UTF-8 -*-
import urllib2
import re
import os
import time
from sgmllib import SGMLParser
from pyquery import PyQuery as pq
from lxml import etree
import urllib
import sys
import httplib

# Force HTTP/1.0 to avoid chunked-transfer problems with some servers
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

reload(sys)
sys.setdefaultencoding("utf-8")

# Fetch a page and decode it as UTF-8
def getHtml(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = unicode(response.read(), 'utf-8')
    return html

# Save page content, creating the target directory if needed
def saveHtml(filepath, html):
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if os.path.exists(filepath):
        os.remove(filepath)
    f = open(filepath, 'w')
    f.write(html)
    f.close()

# Read a saved page back as unicode
def readHtml(filepath):
    f = open(filepath, 'rb')
    return unicode(f.read(), 'utf-8')

# Parse a saved page with pyquery
def resolveHtml(filepath):
    d = pq(filename=filepath)
    return d

# Extract the fields of one article block on the listing page
def resolveBlog(content):
    d_cont = pq(content)
    # Article title
    title = d_cont('h2').text()
    # Article href
    href = d_cont('h2').find('a').attr('href')
    # Article ID (last segment of the href)
    id = href.split('/')[-1]
    # Author, date and category
    au_tm_cat = d_cont('p').filter('.info').text()
    author = au_tm_cat.split()[0]
    date = au_tm_cat.split()[1]
    cat = au_tm_cat.split()[2]
    # Summary
    note = d_cont('p').filter('.note').text()
    blog = [id, title, href, author, date, cat, note]
    return blog

# URLs experimented with during development; only the last one is kept
#url = 'http://www.91333.com/pk10/'
#url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
url = 'http://www.36dsj.com/'
#html = getHtml(url)

# Crawl one listing page and return the list of parsed article records
def crawlpage(url):
    page = urllib2.urlopen(url)
    text = unicode(page.read(), "utf-8")
    d = pq(text)
    bloglist = []
    for i in range(d('article').filter('.excerpt').length):
        content = d('article').filter('.excerpt').eq(i).html()
        bloglist.append(resolveBlog(content))
    return bloglist

# Crawl a listing page, then save each article's body, title and first image
def crawler(url):
    article = crawlpage(url)
    for i in article:
        print i[0], i[1], i[2]
        html = getHtml(i[2])
        htmlname = i[2].split('/')[-1]
        d = pq(html)
        s = d('article').html()
        saveHtml('/etl/etldata/script/tmpdir/html/' + htmlname + '.html', s)
        saveHtml('/etl/etldata/script/tmpdir/html/' + htmlname + '.title', i[1])
        imgurl = d('img').attr('src')
        ir = imgurl.encode("utf-8")
        urllib.urlretrieve(ir, '/etl/etldata/script/tmpdir/image/' + htmlname + '.jpg')

# Walk listing pages 1-4 and skip any page that raises an error
url_base = 'http://www.36dsj.com/page/'
for i in range(1, 5):
    url = url_base + str(i)
    print url
    try:
        crawler(url)
    except:
        continue
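For comparison, the listing-page scrape above can be written more compactly with requests in Python 3. The sketch below is a minimal rework, not part of the original script: it assumes the page still exposes article.excerpt blocks with an h2 link and a p.note summary, and the function name crawl_listing and the dictionary layout are my own.

# Minimal Python 3 sketch of the listing scrape (assumed page structure, see note above)
import requests
from pyquery import PyQuery as pq

def crawl_listing(url):
    resp = requests.get(url, timeout=10)
    resp.encoding = 'utf-8'
    d = pq(resp.text)
    blogs = []
    # Each article.excerpt block holds one article teaser
    for item in d('article').filter('.excerpt').items():
        href = item('h2').find('a').attr('href')
        blogs.append({
            'id': href.split('/')[-1] if href else None,
            'title': item('h2').text(),
            'href': href,
            'note': item('p').filter('.note').text(),
        })
    return blogs

if __name__ == '__main__':
    for blog in crawl_listing('http://www.36dsj.com/page/1'):
        print(blog['id'], blog['title'], blog['href'])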
Shell script that calls the Python crawler and mails a 36大数据 article
#!/bin/bash
alias dt='date +%Y-%m-%d" "%H:%M:%S'
shopt -s expand_aliases

pypath=/usr/local/python2.7/bin
dir=/etl/etldata/script

# Count how many article images have been downloaded so far
a=`ls ${dir}/tmpdir/image/ | wc -l`
echo "ALL:${a}"

# Pick a random number in 1..19 (rand()*20 truncates to 0..19; bump 0 up to 1)
num=`awk 'BEGIN{srand();sum=rand()*20;printf("%d ",sum)}'`
if [ ${num} -eq 0 ]; then
    num=1
fi
echo ${num}

# Refresh the local article cache
$pypath/python ${dir}/tmpdir/crawler_36.py

# Build the list of article IDs from the downloaded image file names
ls -ltr ${dir}/tmpdir/image/ | awk '{print $NF}' | sed 's/\.jpg//g' > ${dir}/tmp

# Take the ${num}-th ID and mail that article
#id=`cat ${dir}/tmp | sed -n "${num}p" | awk '{print $1}'`
id=`cat ${dir}/tmp | head -${num} | awk '{print $1}' | tail -1`
echo "`dt`:${id}"
$pypath/python ${dir}/tmpdir/mail.py ${id}
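As a side note, the awk/head/tail pipeline above is just one way to pick a random article ID; assuming GNU coreutils is available on the host, shuf does the same in one step. This is only an alternative sketch, not part of the original script.

# Alternative sketch (assumes GNU coreutils shuf): pick one random article ID
id=`ls ${dir}/tmpdir/image/ | sed 's/\.jpg//g' | shuf -n 1`
echo "`dt`:${id}"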
Crawling web pages with Python
# -*- coding:UTF-8 -*-
import urllib2
import re
import os
import time
from sgmllib import SGMLParser
from pyquery import PyQuery as pq
from lxml import etree
import urllib
import sys
import requests
import json
import chardet

reload(sys)
sys.setdefaultencoding("utf-8")

# Crawl one 36dsj listing page and return the list of parsed article records
def crawlpage(url):
    page = urllib2.urlopen(url)
    text = unicode(page.read(), "utf-8")
    d = pq(text)
    bloglist = []
    for i in range(d('article').filter('.excerpt').length):
        content = d('article').filter('.excerpt').eq(i).html()
        bloglist.append(resolveBlog(content))
    return bloglist

# Fetch a page by URL and decode it as UTF-8
def getHtml(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = unicode(response.read(), 'utf-8')
    return html

# Fetch a page, detect its encoding with chardet, then re-encode it
# to the local filesystem encoding
def get(url):
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    html = response.read()
    s1 = sys.getfilesystemencoding()
    s2 = chardet.detect(html)['encoding']
    print s1, s2
    return html.decode(s2).encode(s1)

# Save page content, creating the target directory if needed
def saveHtml(filepath, html):
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if os.path.exists(filepath):
        os.remove(filepath)
    f = open(filepath, 'a')
    f.write(html)
    f.close()

# Read a saved page back as unicode
def readHtml(filepath):
    f = open(filepath, 'rb')
    return unicode(f.read(), 'utf-8')

# Parse a saved page with pyquery
def resolveHtml(filepath):
    d = pq(filename=filepath)
    return d

# Extract the fields of one article block on the listing page
def resolveBlog(content):
    d_cont = pq(content)
    # Article title
    title = d_cont('h2').text()
    # Article href
    href = d_cont('h2').find('a').attr('href')
    # Article ID
    id = href.split('/')[-1]
    # Author, date and category
    au_tm_cat = d_cont('p').filter('.info').text()
    author = au_tm_cat.split()[0]
    date = au_tm_cat.split()[1]
    cat = au_tm_cat.split()[2]
    # Summary
    note = d_cont('p').filter('.note').text()
    blog = [id, title, href, author, date, cat, note]
    return blog

# Fetch a page with urllib (raw bytes, no decoding)
def GetHtml(url):
    page = urllib.urlopen(url)
    contex = page.read()
    return contex

# Extract one lottery draw row: date, draw number and result
def GetData(cont):
    data = pq(cont)
    date_id = data('td').eq(0).text()
    code_id = data('td').eq(1).text()
    A = data('td').eq(2).text()
    s = date_id + ' ' + code_id + ' ' + A + ' '
    return s

# URLs experimented with during development; none is used by craw_pk10 below
url = 'http://www.36dsj.com/'
url = 'http://www.91333.com/pk10/'
url = 'http://baidu.lecai.com/lottery/draw/view/557/622132?agentId=5563'
url = 'http://www.weather.com.cn/weather1d/101240303.shtml#input'
url = 'http://www.nmc.gov.cn/publish/forecast/AJX/wuyuan.html'
url = 'http://php.weather.sina.com.cn/search.php?city=%E6%C4%D4%B4&dpc=1'
#url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html'
#url = 'http://hq.sinajs.cn/list=sh601006'

# Crawl one day's PK10 draw list and append each row to a CSV file
def craw_pk10(date_id):
    url = 'http://baidu.lecai.com/lottery/draw/list/557?d=' + date_id
    html = getHtml(url)
    d = pq(html)
    data = d('table').filter('#draw_list').find('tbody').find('tr')
    print data.length
    for i in range(data.length):
        s = GetData(data.eq(i))
        a = open('/home/hadoop/python/PK10_20170610.csv', 'a')
        a.write(s)
        a.close()
    #saveHtml('/home/hadoop/python/2017-06-10.csv', s)
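A small usage sketch for craw_pk10, assuming the d= parameter accepts dates in YYYY-MM-DD form (as the commented-out 2017-06-10.csv filename above suggests); the starting date, the ten-day range and the one-second pause are illustrative, not from the original script.

# Illustrative usage sketch: fetch ten consecutive days of PK10 draws
from datetime import date, timedelta

start = date(2017, 6, 1)            # assumed starting date
for offset in range(10):
    day = start + timedelta(days=offset)
    craw_pk10(day.strftime('%Y-%m-%d'))
    time.sleep(1)                   # pause between requests to be polite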