zoukankan      html  css  js  c++  java
  • python google play


    #!/usr/bin/env python
    #-*- coding: utf-8  -*-
    import urllib 
    import urllib2 
    import random 
    import requests
    import os,sys 
    import MySQLdb
    from sgmllib import SGMLParser 
    from BeautifulSoup import BeautifulSoup
    import re
    # Running counter used to name downloaded cover images (googlemarket/0, 1, ...).
    num = 0

    _PLAY_BASE_URL = "https://play.google.com"
    _CATEGORY_URL = "https://play.google.com/store/apps/category/"
    _COVER_DIR = "googlemarket"
    _REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled socket hangs the crawl
    _IMAGE_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'}

    _CATEGORIES = (
        'PERSONALIZATION', 'TRANSPORTATION', 'SPORTS', 'HEALTH_AND_FITNESS',
        'APP_WALLPAPER', 'COMICS', 'MEDICAL', 'BUSINESS', 'BOOKS_AND_REFERENCE',
        'WEATHER', 'ENTERTAINMENT', 'MEDIA_AND_VIDEO', 'APP_WIDGETS', 'TOOLS',
        'PHOTOGRAPHY', 'PRODUCTIVITY', 'EDUCATION', 'NEWS_AND_MAGAZINES',
        'TRAVEL_AND_LOCAL', 'LIFESTYLE', 'SOCIAL', 'FINANCE', 'SHOPPING',
        'LIBRARIES_AND_DEMO', 'COMMUNICATION', 'MUSIC_AND_AUDIO', 'GAME',
    )

    # All patterns run against BeautifulSoup-prettified markup. "[\s\S]*?" is a
    # non-greedy "any character including newline" (the prettified text is
    # multi-line, so "." alone would not span it).
    _APP_LINK_RE = re.compile(r'<a class="title" href="(.+?)" title')
    _NAME_RE = re.compile(r'<div class="document-title" itemprop="name">[\s\S]*?<div>([\s\S]*?)</div>')
    _DEVELOPER_RE = re.compile(r'itemprop="name">([\s\S]*?)</a>')
    _VERSION_RE = re.compile(r'itemprop="softwareVersion">([\s\S]*?)</div>')
    _PUBLISHED_RE = re.compile(r'itemprop="datePublished">([\s\S]*?)</div>')
    _FILESIZE_RE = re.compile(r'itemprop="fileSize">([\s\S]*?)</div>')
    _SUPPORT_RE = re.compile(r'itemprop="operatingSystems">([\s\S]*?)</div>')
    _DESCRIPTION_RE = re.compile(r'itemprop="description">[\s\S]*?<div>([\s\S]*?)</div>')
    _COVER_RE = re.compile(r'<img class="cover-image" src=(.+?) alt="Cover art" itemprop="image" />')
    # Matches the <br />, <p> and </p> tags that are replaced by a space in descriptions.
    _BREAK_TAG_RE = re.compile(r'</?(?:br|p)\s*/?>')

    # Line of the first _DEVELOPER_RE match that holds the developer name.
    # NOTE(review): this offset comes from the original script and depends on
    # the exact prettified page layout -- confirm against current markup.
    _DEVELOPER_LINE_INDEX = 8

    _INSERT_SQL = "insert into address(name,version,developer,pubtime,filesize,support,introduction) values(%s,%s,%s,%s,%s,%s,%s)"


    def _fetch_pretty_html(url):
        """Download *url* and return its markup re-indented by BeautifulSoup."""
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.content).prettify("utf-8")


    def _first_match(pattern, text):
        """Return the first captured group of *pattern* in *text*, stripped, or None."""
        matches = pattern.findall(text)
        if not matches:
            return None
        return matches[0].strip()


    def _extract_developer(page):
        """Return the developer name from a details page, or None if not found."""
        blocks = _DEVELOPER_RE.findall(page)
        if not blocks:
            return None
        lines = blocks[0].split("\n")
        if len(lines) <= _DEVELOPER_LINE_INDEX:
            return None
        return lines[_DEVELOPER_LINE_INDEX].strip()


    def _extract_record(page):
        """Parse one details page into a tuple ordered like the _INSERT_SQL columns.

        Fields that cannot be found are None. Each field is printed as it is
        extracted, mirroring the progress output of the original script.
        """
        name = _first_match(_NAME_RE, page)
        developer = _extract_developer(page)
        version = _first_match(_VERSION_RE, page)
        pubtime = _first_match(_PUBLISHED_RE, page)
        filesize = _first_match(_FILESIZE_RE, page)
        support = _first_match(_SUPPORT_RE, page)  # supported OS / firmware
        introduction = _first_match(_DESCRIPTION_RE, page)
        if introduction is not None:
            introduction = _BREAK_TAG_RE.sub(' ', introduction)
        record = (name, version, developer, pubtime, filesize, support, introduction)
        for field in record:
            print(field)
        return record


    def _store_record(conn, cursor, record):
        """Insert *record* into the address table; skip pages with no app name."""
        if record[0] is None:
            return
        try:
            # Parameterized execute: the driver escapes the scraped values.
            # A None field is sent as SQL NULL.
            cursor.execute(_INSERT_SQL, record)
            conn.commit()
        except MySQLdb.Error as e:
            print(e)
            conn.rollback()


    def _save_covers(page):
        """Download every cover image found in *page* into _COVER_DIR."""
        global num
        for quoted_src in _COVER_RE.findall(page):
            # The capture includes the attribute's surrounding double quotes.
            image_url = quoted_src.strip('"')
            print(image_url)
            response = requests.get(image_url, headers=_IMAGE_HEADERS,
                                    timeout=_REQUEST_TIMEOUT)
            response.raise_for_status()
            # Binary mode: the payload is image bytes, not text.
            with open(os.path.join(_COVER_DIR, str(num)), "wb") as cover_file:
                cover_file.write(response.content)
            num = num + 1
            print(num)


    def main():
        """Crawl Google Play category pages, store app metadata and cover images.

        For every category in _CATEGORIES the listing page is fetched, each
        distinct app link on it is followed, the app's fields are extracted
        with regular expressions, a row is inserted into the ``address`` table
        of the ``googlemarket`` MySQL database, and cover images are written to
        the ``googlemarket`` directory as files named by the global ``num``.

        A network failure on one page is printed and that page is skipped.
        Exits the process with status 1 if the database connection fails.
        """
        try:
            # NOTE(review): credentials are hard-coded; consider reading them
            # from configuration or the environment.
            conn = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                                   db='googlemarket', charset="utf8")
            conn.query("set names utf8")
        except MySQLdb.Error as e:
            print(e)
            sys.exit(1)

        if not os.path.isdir(_COVER_DIR):
            os.makedirs(_COVER_DIR)

        cursor = conn.cursor()
        try:
            for category in _CATEGORIES:
                try:
                    listing = _fetch_pretty_html(_CATEGORY_URL + category)
                except requests.exceptions.RequestException as e:
                    print(e)
                    continue
                # set(): the same app link can appear more than once on a listing page.
                for href in set(_APP_LINK_RE.findall(listing)):
                    url = _PLAY_BASE_URL + href
                    print(url)
                    try:
                        page = _fetch_pretty_html(url)
                    except requests.exceptions.RequestException as e:
                        print(e)
                        continue
                    _store_record(conn, cursor, _extract_record(page))
                    try:
                        _save_covers(page)
                    except requests.exceptions.RequestException as e:
                        # A missing cover should not abort the whole crawl.
                        print(e)
        finally:
            cursor.close()
            conn.close()
    	
    			
    		
    				
    	
        
    # Run the crawler only when this file is executed as a script.
    if __name__ == "__main__":
        main()
    




    <type 'str'>
    Traceback (most recent call last):
      File "crawler0729.py", line 103, in <module>
        main()
      File "crawler0729.py", line 91, in main
        temp=requests.get(j[1:-2], headers=headers)
      File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 55, in get
        return request('get', url, **kwargs)
      File "/usr/local/lib/python2.7/dist-packages/requests/api.py", line 44, in request
        return session.request(method=method, url=url, **kwargs)
      File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 335, in request
        resp = self.send(prep, **send_kwargs)
      File "/usr/local/lib/python2.7/dist-packages/requests/sessions.py", line 438, in send
        r = adapter.send(request, **kwargs)
      File "/usr/local/lib/python2.7/dist-packages/requests/adapters.py", line 327, in send
        raise ConnectionError(e)
    requests.exceptions.ConnectionError: HTTPSConnectionPool(host='lh3.ggpht.com', port=443): Max retries exceeded with url: /RBld17rLw4Ik0JtOaKk4bZB2RiGJ2R8H5Q8Rjw3Hh6BAM694fOzzKj1TJFr7R02ZS_40=w30 (Caused by <class 'socket.error'>: [Errno 101] Network is unreachable)


  • 相关阅读:
    bzoj2002: [Hnoi2010]Bounce 弹飞绵羊 [分块][LCT]
    luoguP1886 滑动窗口 [单调队列]
    bzoj1047: [HAOI2007]理想的正方形
    bzoj1012: [JSOI2008]最大数maxnumber [单调队列]
    树与二叉树之二--二叉树的性质与存储
    树与二叉树之一--基本概念与存储结构
    Markdown段首空格
    C++ atan2
    凸包学习笔记
    Codeforces Round #545 (Div. 1) E. Train Car Selection
  • 原文地址:https://www.cnblogs.com/javawebsoa/p/3228826.html
Copyright © 2011-2022 走看看