# -*- coding: utf-8 -*-
# search.py

import urllib2
import json
import threading
# import copy
from sgmllib import SGMLParser

DETAIL_URL = "http://shixin.court.gov.cn/detail?id={}"
DETAIL_KEYS = ["age", "sexy", "cardNum", "areaName", "courtName", "gistId",
               "regDate", "gistUnit", "duty", "performance", "disruptTypeName"]

csv_file = open('search.csv', 'w')
# Lock guarding writes to the CSV file
file_lock = threading.RLock()
# Count of records already written
num = 0


class GetIdList(SGMLParser):

    def reset(self):
        self.all_data = []
        self.IDlist = []
        self.flag = False
        self.getdata = False
        SGMLParser.reset(self)

    def start_tr(self, attrs):
        # Each data row on the listing page is a <tr style="height:28px;">
        for k, v in attrs:
            if k == 'style' and v == 'height:28px;':
                self.flag = True
                return

    def end_tr(self):
        # Reached </tr>: fetch the detail record for the id collected from this row
        self.flag = False
        if self.IDlist:
            self.get_detail(self.IDlist[1])

    def start_a(self, attrs):
        # Inside a data cell, the <a id="..."> attribute carries the record id
        if self.getdata:
            for k, v in attrs:
                if k == 'id':
                    self.IDlist.append(v)

    def start_td(self, attrs):
        if not self.flag:
            return
        self.getdata = True

    def end_td(self):
        # Reached </td>
        if self.getdata:
            self.getdata = False

    def handle_data(self, text):
        # Collect the text of every cell in the current row
        if self.getdata:
            self.IDlist.append(text)

    def get_detail(self, pid):
        # Fetch the JSON detail record, retrying until the request succeeds
        while True:
            print pid, self.IDlist[2]
            try:
                detail_msg = urllib2.urlopen(DETAIL_URL.format(pid)).read()
            except urllib2.HTTPError:
                continue
            break
        detail = json.loads(detail_msg)
        # Drop the last four summary columns; the detail fields replace them
        self.IDlist = self.IDlist[:-4]
        for item in DETAIL_KEYS:
            self.IDlist.append(detail.get(item, ''))
        # Normalise every field to unicode before joining into a CSV line
        for index, item in enumerate(self.IDlist):
            if not isinstance(item, unicode):
                self.IDlist[index] = str(item).decode('utf-8')
        self.all_data.append(','.join(self.IDlist))
        self.IDlist = []

    def print_data(self):
        # Serialise writes from all threads through the shared lock
        global num
        with file_lock:
            for line in self.all_data:
                line = line.replace(' ', '')
                print >> csv_file, line.encode('utf-8')
                num += 1
                print num


class MyThread(threading.Thread):

    def __init__(self, first_num, last_num):
        threading.Thread.__init__(self)
        self.first_num = first_num
        self.last_num = last_num

    def run(self):
        for i in range(self.first_num, self.last_num + 1):
            content = None
            try_cnt = 0
            while True:
                try:
                    try_cnt += 1
                    content = urllib2.urlopen('http://shixin.court.gov.cn/personMore.do?currentPage={}'.format(i)).read()
                except urllib2.HTTPError:
                    if try_cnt < 5:
                        continue
                    # All five attempts failed: skip this page
                    break
                break
            # print content
            if content:
                parser = GetIdList()
                parser.feed(content)
                parser.print_data()
                print "*********Page finished:", i
            else:
                break


# Total number of listing pages to query (at least this many)
# all_search_num = 90000
all_search_num = 90
# Number of worker threads
all_thread_num = 9
# Pages per thread (integer division, rounded up)
gap = all_search_num / all_thread_num + 1

for thread_num in range(0, all_thread_num):
    this_thread = MyThread(thread_num * gap + 1, (thread_num + 1) * gap)
    this_thread.start()
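
# Optional wrap-up (a sketch, not required for the scrape itself): block until
# every spawned worker has finished, then close the CSV file explicitly rather
# than relying on interpreter shutdown to flush it.
for worker in threading.enumerate():
    if worker is not threading.current_thread():
        worker.join()
csv_file.close()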