# encoding:utf-8
'''
Store a baike XML upload file in offdb, keyed by the lemmaId found in it,
and append the matching baike.sogou.com URL to a dated list file.

Created on 2014-07-14

@author: caoshouxin
'''
import os
import re
import os.path
from lxml import etree
from sogou import offdb,docid
import traceback
import struct
import logging as L
from time import localtime,strftime

L.basicConfig(level=L.INFO, format='[%(asctime)s] %(levelname)-8s %(message)s')

filename = "baike_soso_upload_20140717-160704.26523.xml"
# Parenthesised single-argument form prints the same text on Python 2 and
# also parses on Python 3.
print("文件操作"+os.getcwd())


class sosobaikeProcess():
    '''
    Process one baike XML file.

    All of the work happens in the constructor: it opens an offdb
    connection, extracts the lemmaId and action type from the head of the
    file, appends the entry URL to ``sosobaike_<YYYY-MM-DD>_<type>`` in the
    current directory and finally stores the file in offdb.
    '''

    # Number of leading bytes inspected when looking for the tags.
    _HEAD_SIZE = 1024

    # lemmaId followed by an action tag.
    # NOTE(review): re.M does not let "." cross newlines, so this only
    # matches when both tags are on the same line -- confirm whether re.S
    # was intended. Flags are left exactly as in the original.
    _LEMMA_ACTION_RE = re.compile("<lemmaId>(.*?)</lemmaId>.*?<action>(.*?)</action>", re.M)
    # lemmaId alone; used when no action tag was matched.
    _LEMMA_RE = re.compile("<lemmaId>(.*?)</lemmaId>", re.M)

    def __init__(self, filename, ip="127.0.0.1", port="9999"):
        '''
        Open the offdb connection and process ``filename`` immediately.

        :param filename: path of the XML file to process.
        :param ip: address passed to ``offdb.QuickAdapter.open``.
        :param port: port passed to ``offdb.QuickAdapter.open``.
        '''
        url_beg = "http://baike.sogou.com/v"
        url_end = ".htm "
        self.file_name = filename
        self.offdb_rand = offdb.QuickAdapter()
        self.offdb_rand.open(ip, port, 5)
        now_time = strftime("%Y-%m-%d", localtime())

        result_tup = self.getlemmaId_type()
        if result_tup is None:
            # File missing or no lemmaId found; already logged.
            return

        # getlemmaId_type() returns a 2-tuple. The original unpacked a third
        # element ("value") that was never produced, which raised ValueError
        # on every successful parse.
        lemmaId, baike_type = result_tup
        # NOTE(review): the payload stored in offdb is assumed to be the
        # complete file content -- confirm against the consumers of the db.
        value = self._read_value()

        outputFile = "sosobaike_" + now_time + "_" + baike_type
        with open(outputFile, 'a') as outf:
            # NOTE(review): the original wrote url_beg+url_end with no id in
            # between, producing the same URL for every file; the lemmaId is
            # assumed to belong in the URL -- confirm.
            outf.write(url_beg + lemmaId + url_end)

        self.put_qdb(lemmaId, value)

    def _read_value(self):
        '''Return the complete content of ``self.file_name`` as raw bytes.'''
        with open(self.file_name, 'rb') as f:
            return f.read()

    def put_qdb(self, lemmaId, value):
        '''
        Store ``value`` in offdb under the packed integer ``lemmaId``.

        :param lemmaId: entry id; anything accepted by ``int()``.
        :param value: payload handed to ``offdb_rand.put`` unchanged.

        Any exception (including a non-numeric lemmaId) is logged with its
        traceback and the offdb connection is closed; nothing is re-raised.
        '''
        try:
            # 'i' packs a native-byte-order C int.
            key = struct.pack('i', int(lemmaId))
            ret = self.offdb_rand.put(key, value, 0, 5)
            if ret in (0, 1):
                # Return codes 0 and 1 are the ones this code treats as success.
                L.info("put file %s/%s success %d"%(self.file_name,lemmaId,1))
            else:
                # NOTE(review): offdb_reconnect is not defined in the visible
                # part of this file; if it is missing, the AttributeError is
                # caught by the handler below.
                self.offdb_reconnect(5, 3)
        except Exception:
            L.error("put file %s/%s err %d because:%s"%(self.file_name,lemmaId,1,traceback.format_exc()))
            self.offdb_rand.close()

    def getlemmaId_type(self):
        '''
        Extract the lemmaId and action type from the head of the file.

        :return: ``(lemmaId, baike_type)`` as strings. ``baike_type`` is the
                 text of the ``<action>`` tag, or ``"update"`` when only a
                 ``<lemmaId>`` tag was matched. Returns ``None`` when the
                 file does not exist or no ``<lemmaId>`` tag is found.
        '''
        if not os.path.isfile(self.file_name):
            L.info("put file%s not found"%(self.file_name))
            return None

        # To save memory and speed up matching, read only the first 1024
        # bytes of the file.
        with open(self.file_name) as f:
            filecontent = f.read(self._HEAD_SIZE)

        m = self._LEMMA_ACTION_RE.search(filecontent)
        if m is not None:
            lemmaId = m.group(1)
            baike_type = m.group(2)
        else:
            m_1 = self._LEMMA_RE.search(filecontent)
            if m_1 is None:
                L.info("put file %s not found result"%(self.file_name))
                # The original had "print None" here, a typo for a return.
                return None
            # The original read m.group(1) here, but m is None in this
            # branch, so it always raised AttributeError.
            lemmaId = m_1.group(1)
            baike_type = "update"

        L.info("put file %s lemmaId=%s, type=%s "%(self.file_name,lemmaId,baike_type))
        return (lemmaId, baike_type)