  • Python web crawler
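The script below is a multithreaded Python 2 crawler for www.xuebang.com.cn: for each university id it downloads the department list, follows every department to its teacher list, then fetches each teacher's comment page, saving the departments, teachers, and comments as text/JSON files in a folder named after the university.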

    #! /usr/bin/env python
    #coding=utf-8

    import requests
    import re,json
    import sys,os
    import Queue,threading
    from bs4 import BeautifulSoup
    # Python 2 only: reload()/setdefaultencoding, Queue, xrange and the
    # print statements below do not exist in Python 3.
    reload(sys)
    sys.setdefaultencoding("utf8")

    def http_req_get(siteurl):
        # Static browser headers (including a captured session cookie) so the
        # site answers as it would to a normal logged-in browser.
        headers = {
            "Host": "www.xuebang.com.cn",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "Cookie": "__cfduid=da7335f4b0e760976f98697b651fc10041447572288; pgv_pvi=7944819712; deptNumOf11=140; deptNumOf89=60; deptNumOf711=60; commentNumOf11215=1074; deptNumOf1411=56; JSESSIONID=abcqLyMOLKEVDbynTTtev; a2666_pages=1; a2666_times=4; pgv_si=s4040530944; Hm_lvt_8147cdaed425fa804276ea12cd523210=1447572289,1447678990,1447734730; Hm_lpvt_8147cdaed425fa804276ea12cd523210=1447734730; CNZZDATA5928106=cnzz_eid%3D1168227404-1447570407-%26ntime%3D1447729389; Hm_lvt_863e19f68502f1ae0f9af1286bb12475=1447572289,1447678990,1447734730; Hm_lpvt_863e19f68502f1ae0f9af1286bb12475=1447734730; _ga=GA1.3.122575526.1447572289; _gat=1"}
        try:
            urlobj = requests.get(
                url = siteurl,
                headers = headers,
            )
            return urlobj
        except Exception,e:
            print 'request exception'
            pass

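    # A quick sanity check of the helper (sketch only; the deptlist path is
    # the one built in __main__ below):
    #
    #     >>> obj = http_req_get('http://www.xuebang.com.cn/11/deptlist')
    #     >>> obj.status_code        # a requests.Response on success, None on error
    #     200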

    #def request_get(siteurl):

    class LinksParser(object):
        def __init__(self,urlobj):
            self.urlobj = urlobj
            self.soup = BeautifulSoup(self.urlobj.text, "html.parser")

        # Create an output directory named after the university (the page <title>)
        def createDaXueDir(self):
            # Directory of this script; create the folder there if it is missing
            path = sys.argv[0]
            current_dir = os.path.dirname(path)
            real_path = os.path.join(current_dir, self.soup.title.text.encode('gb18030'))
            if os.path.exists(real_path):
                pass
            else:
                try:
                    os.mkdir(real_path)
                except:
                    pass
            return real_path

        # Collect the departments and write them to a file
        def xueyuan(self,path):
            try:
                fh = open(path + '/xueyuan.txt','wb')
                for line in self.soup.find_all('a',{'class':'yxcologe'}):
                    fh.write(line.text.encode('gb18030').strip() + '\n')
                fh.close()
            except:
                pass

        # Collect the teacher-list URL of every department
        def teacher(self,path):
            lst = []
            length = len(self.soup.find_all('a',{'class':'yxcologe'}))
            for i in xrange(length):
                # Teacher-list link of each department, one by one
                url = self.soup.find_all('a',{'class':'yxcologe'})[i]['href'].encode('gb18030')
                lst.append(url)
            return lst

        # Collect every teacher on a teacher-list page; fills the module-level
        # lists teacher_lst and teacher_comment_url set up in __main__
        def teacher_lst(self):
            length = len(self.soup.find('span',{'class':'TJszlist'}).find_all('li'))
            for i in xrange(length):
                lst = self.soup.find('span',{'class':'TJszlist'}).find_all('li')[i].find('a')['title']
                # Cut the department name out of the breadcrumb trail
                yuanxi = str(self.soup.find('span',{'class':'t_dqwz'}))
                yuanxi = yuanxi[-40:]
                yuanxi = yuanxi.split('»')[1]
                yuanxi = yuanxi.split('<')[0]
                teacher_lst.append({'department':yuanxi,'name':lst})
                url = self.soup.find('span',{'class':'TJszlist'}).find_all('li')[i].find('a')['href'].encode('gb18030')
                teacher_comment_url.append(url)

        # Collect every comment left for a teacher, returned as a JSON string
        def comment_teacher(self):
            length = len(self.soup.find_all('span',{'class':'TJR_info'}))
            if length == 0:
                return 'no comments'
            else:
                for i in xrange(length):
                    teacher_content = self.soup.find_all('span',{'class':'TJR_info'})[i].find('p',{'class':'TJlycon'}).text
                    teacher_name = self.soup.find(color='#0088cc').text
                    teacher_time = self.soup.find_all('span',{'class':'TJR_info'})[i].find('span').string
                    teacher_all_comment.append({'teacher_id':teacher_name,'comment':teacher_content,'time':teacher_time})
                # Dumps the whole accumulated list, not just this page's comments
                json_data = json.dumps(teacher_all_comment, encoding="UTF-8", ensure_ascii=False)
                return json_data

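    # The parse steps above chain together roughly like this (sketch, using
    # university id 11 from the id list in __main__):
    #
    #     page = LinksParser(http_req_get('http://www.xuebang.com.cn/11/deptlist'))
    #     folder = page.createDaXueDir()     # one output folder per university
    #     page.xueyuan(folder)               # department names -> xueyuan.txt
    #     dept_urls = page.teacher(folder)   # per-department teacher-list URLs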
    class myThreads(threading.Thread):
        # Worker: pull department URLs off the queue and harvest their teachers
        def __init__(self,queue):
            threading.Thread.__init__(self)
            self.queue = queue

        def run(self):
            while True:
                if self.queue.empty():
                    break
                else:
                    try:
                        url = self.queue.get_nowait()
                        res_obj = LinksParser(http_req_get(url))
                        res_obj.teacher_lst()
                    except Exception,e:
                        break

    class commentThreads(threading.Thread):
        # Worker: pull teacher URLs off the queue and harvest their comments
        def __init__(self,queue):
            threading.Thread.__init__(self)
            self.queue = queue

        def run(self):
            while True:
                if self.queue.empty():
                    break
                else:
                    try:
                        url = self.queue.get_nowait()
                        res_obj = LinksParser(http_req_get(url))
                        test = res_obj.comment_teacher()
                        # Each worker rewrites the file with the full accumulated
                        # list, so the last write contains every comment so far
                        fh = open(real_path + '/teacher_comment_lst.txt','wb')
                        fh.write(test)
                        fh.close()
                    except Exception,e:
                        break

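    # Note: the queue.empty()/get_nowait() pair in both run() loops is not
    # atomic, so a worker can still raise Queue.Empty in between; the broad
    # `except ... break` absorbs that. A tighter version of the same loop
    # (sketch only):
    #
    #     while True:
    #         try:
    #             url = self.queue.get_nowait()
    #         except Queue.Empty:
    #             break
    #         res_obj = LinksParser(http_req_get(url))
    #         res_obj.teacher_lst()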
    if __name__ == '__main__':
        # University ids to crawl (could also be read from sys.argv)
        #i = sys.argv[1]
        idlist = [11, 129, 70, 71]
        for i in idlist:
            i = str(i)
            thread_number = 50
            url = 'http://www.xuebang.com.cn/' + i + '/deptlist'
            try:
                urlobj = http_req_get(url)
                # Parse the department-list page
                response_obj = LinksParser(urlobj)
                # Folder the output files are saved in
                real_path = response_obj.createDaXueDir()
                response_obj.xueyuan(real_path)

                # Teacher-list URL of every department of this university
                xi_to_teacher = response_obj.teacher(real_path)

                # All teachers found; filled in by the worker threads
                # (module-level `global` is a no-op, kept for readability)
                global teacher_lst
                teacher_lst = []

                # Comment-page URL of every teacher
                global teacher_comment_url
                teacher_comment_url = []

                # Crawl the department pages with a pool of threads
                queue = Queue.Queue()
                for line in xi_to_teacher:
                    queue.put(line)
                threads = []
                for _ in xrange(thread_number):
                    threads.append(myThreads(queue))
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()
                teacher_lst = json.dumps(teacher_lst, encoding="UTF-8", ensure_ascii=False)
                #print len(teacher_lst)

                # Write the teacher list to a file
                try:
                    fh = open(real_path + '/teacher_lst.txt','wb')
                    fh.write(teacher_lst)
                    fh.close()
                except:
                    pass

                global teacher_all_comment
                teacher_all_comment = []
                # Fetch every teacher's comment page
                comment_queue = Queue.Queue()
                for line_url in teacher_comment_url:
                    comment_queue.put(line_url)

                comments = []
                for _ in xrange(thread_number):
                    comments.append(commentThreads(comment_queue))
                for t in comments:
                    t.start()
                for t in comments:
                    t.join()
            except:
                pass

            #print teacher_all_comment
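The listing above only runs under Python 2 (reload/setdefaultencoding, Queue, xrange, and the print statements). As a minimal sketch, assuming the same site and the same headers dict, the request helper would port to Python 3 like this (the timeout is an addition, not in the original):

    import requests

    def http_req_get(siteurl, headers):
        # Same GET with spoofed browser headers; None signals a failed request.
        try:
            return requests.get(siteurl, headers=headers, timeout=10)
        except requests.RequestException as e:
            print('request failed: %s' % e)
            return None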
Original post: https://www.cnblogs.com/jsq16/p/6018396.html