zoukankan      html  css  js  c++  java
  • python 爬虫的初步实践

    简介

    我的 sister 想要我爬一些试题给她。一共有80套,她不想手工一个个点,所以我来了。网站比较简单,所以没有费很大的力气。期间参考了一系列的网站,都没有记录下来。

    code

    #!/usr/bin/env python
    #coding=utf-8
    
    import pdfkit
    import time
    import requests
    import sys
    import urllib2
    import re
    
    def get_hiddenvalue(url):
        """Download *url* and return its hidden postback field values.

        Args:
            url: Address of the page to fetch with a plain GET request.

        Returns:
            The ``(__VIEWSTATE, __EVENTVALIDATION)`` tuple that
            ``get_hiddenvalue_string`` extracts from the downloaded body.

        Raises:
            IndexError: If either hidden field is missing from the page.
        """
        response = urllib2.urlopen(urllib2.Request(url))
        try:
            body = response.read()
        finally:
            # urllib2 responses are not context managers on Python 2, so
            # close explicitly instead of leaking a socket on every call.
            response.close()
        # Share the parsing logic rather than duplicating the regexes here.
        return get_hiddenvalue_string(body)
    
    def get_hiddenvalue_string(myStr):
        """Extract the ``__VIEWSTATE`` and ``__EVENTVALIDATION`` values from HTML.

        Args:
            myStr: HTML text containing the two hidden ``<input>`` elements.

        Returns:
            A ``(viewstate, eventvalidation)`` tuple holding the ``value``
            attribute of the first occurrence of each field.

        Raises:
            IndexError: If either field is missing. The exception type matches
                what indexing an empty match list raised before, but the
                message now names the missing field.
        """
        # Matching is case-insensitive, but attribute order and the trailing
        # " />" must appear exactly as written in the patterns.
        patterns = (
            ('__VIEWSTATE',
             r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />'),
            ('__EVENTVALIDATION',
             r'input type="hidden" name="__EVENTVALIDATION" id="__EVENTVALIDATION" value="(.*?)" />'),
        )
        values = []
        for field, pattern in patterns:
            # Only the first occurrence is used, so search() is enough.
            match = re.search(pattern, myStr, re.I)
            if match is None:
                raise IndexError('hidden field %s not found in page' % field)
            values.append(match.group(1))
        return tuple(values)
    
    # Python 2 only: switch the implicit str/unicode codec to UTF-8.
    # NOTE(review): presumably needed so the unicode response text can be
    # written to a byte-mode file without an explicit encode -- confirm before
    # porting to Python 3, where reload()/setdefaultencoding() do not exist.
    reload(sys)
    sys.setdefaultencoding("utf-8")

    # Form fields sent with every POST. 'cid' and 'pid' are overwritten for each
    # exam page; 'drop1'/'drop2' never change, so they are set once here.
    data = {
        'cid': '1',
        'pid': '5',
        'drop1': '1',
        'drop2': '1',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    }
    # Options handed to pdfkit for the PDF rendering.
    options = {
        'page-size': 'A4',
        'margin-top': '0mm',
        'margin-right': '0mm',
        'margin-bottom': '0mm',
        'margin-left': '0mm',
        # 'orientation': 'Landscape',  # landscape layout
        'encoding': "UTF-8",
        'no-outline': None,
        # 'footer-right': '[page]',  # print page numbers
    }
    # Output file-name prefix for each 'cid' value (cid 1 -> first entry, ...).
    myMap = ["UniversityPsychology", "HigherEducationRegulations", "HigherEducation", "TeacherEthics"]
    confg = pdfkit.configuration(wkhtmltopdf='/usr/local/bin/wkhtmltopdf')

    # Submit buttons pressed in order on each exam page. The labels translate to
    # "submit and view single-choice / multiple-choice / true-false answers".
    submit_steps = [
        ('Button1', '提交并查看单选题答案'),
        ('Button2', '提交并查看多选题答案'),
        ('Button3', '提交并查看判断题答案'),
    ]

    for cid, subject in enumerate(myMap, 1):
        for pid in range(1, 21):
            data['cid'] = str(cid)
            data['pid'] = str(pid)
            url = 'http://zjzx.zjnu.edu.cn/test/Default.aspx?cid=%d&pid=%d' % (cid, pid)
            pdf_name = '%s_exam%d.pdf' % (subject, pid)
            html_name = '%s_exam%d.html' % (subject, pid)

            # Each POST carries the __VIEWSTATE/__EVENTVALIDATION values of the
            # page it is submitted from: the first pair comes from a fresh GET,
            # later pairs from the previous POST's response.
            data['__VIEWSTATE'], data['__EVENTVALIDATION'] = get_hiddenvalue(url)
            page = None
            for step, (button, label) in enumerate(submit_steps):
                if step:
                    # Pause between consecutive submissions to the same page.
                    time.sleep(3)
                    data['__VIEWSTATE'], data['__EVENTVALIDATION'] = get_hiddenvalue_string(page)
                data[button] = label
                response = requests.post(url=url, data=data, headers=headers)
                page = response.text
                # Only one button may be present per submission.
                del data[button]

            # Overwrite instead of append: in append mode a re-run stacked a
            # second copy of the page into the snapshot and into the PDF.
            with open(html_name, 'w') as snapshot:
                snapshot.write(page)

            # Render from the in-memory page; reading the snapshot back is
            # unnecessary now that it holds exactly this content.
            pdfkit.from_string(page, pdf_name, configuration=confg, options=options)
    
    
    Hope is a good thing, maybe the best of things, and no good thing ever dies. — Andy Dufresne
  • 相关阅读:
    MySQL百万级数据量分页查询方法及其优化
    Windows10内置Linux子系统初体验
    谈谈区块链(18):以太坊的UTXO
    永久告别mac屏幕涂层脱落
    Cloud Foundry中DEA启动应用实例时环境变量的使用
    jQuery 事件方法大全-超全的总结
    UVA12304-2D Geometry 110 in 1!
    Hbase总结(五)-hbase常识及habse适合什么场景
    Android笔记之 网络http通信
    Mac下安装Redis
  • 原文地址:https://www.cnblogs.com/eat-too-much/p/11655176.html
Copyright © 2011-2022 走看看