zoukankan      html  css  js  c++  java
  • python实例

    # encoding:utf-8
    '''
    Created on 2014年7月14日
    
    @author: caoshouxin
    '''
    import os
    import re
    import os.path
    from lxml import etree
    from sogou import offdb,docid
    import traceback
    import struct
    import logging as L
    from time import localtime,strftime
    L.basicConfig(level=L.INFO, format='[%(asctime)s] %(levelname)-8s %(message)s')
        
    # Sample upload file name; nothing in the visible code reads this variable.
    filename="baike_soso_upload_20140717-160704.26523.xml"
    # Print the current working directory, prefixed with a label meaning
    # "file operation". The call is parenthesised so the line parses under both
    # Python 2 and Python 3; the printed text is unchanged.
    print("文件操作"+os.getcwd())
    
    class sosobaikeProcess(object):
        """Process one baike upload file: record its URL and store it in offdb.

        Constructing an instance performs the whole job: it opens an offdb
        connection, extracts the lemma id and action type from the head of the
        file, appends the lemma URL to a dated output file and puts the record
        into offdb under the packed lemma id.
        """

        def __init__(self,filename,ip="127.0.0.1",port="9999"):
            """Open the offdb connection and process ``filename``.

            Args:
                filename: path of the XML upload file to process.
                ip: offdb host passed to ``QuickAdapter.open``.
                port: offdb port passed to ``QuickAdapter.open``.
            """
            url_beg="http://baike.sogou.com/v"
            url_end=".htm\n"
            self.file_name=filename
            self.offdb_rand=offdb.QuickAdapter()
            self.offdb_rand.open(ip,port,5)
            now_time=strftime("%Y-%m-%d",localtime())
            result_tup=self.getlemmaId_type()
            if result_tup is None:
                # put_qdb is the only other place that closes the connection,
                # so release it here when there is nothing to store.
                self.offdb_rand.close()
                return
            (lemmaId,baike_type)=result_tup
            # One output file per day and per action type; each line is the
            # public URL of a processed lemma.
            outputFile="sosobaike_"+now_time+"_"+baike_type
            with open(outputFile,'a') as outf:
                outf.write(url_beg+lemmaId+url_end)
            # NOTE(review): the original passed an undefined name `value` to
            # put_qdb. The full file content is assumed to be the intended
            # payload -- confirm against the offdb consumer.
            with open(self.file_name,'rb') as f:
                value=f.read()
            self.put_qdb(lemmaId, value)

        def put_qdb(self,lemmaId,value):
            """Store ``value`` in offdb under the packed ``lemmaId``, then close.

            Any exception is logged with its traceback and swallowed; the
            connection is closed in every case.

            Args:
                lemmaId: lemma id as a decimal string (or anything ``int`` accepts).
                value: payload handed to ``offdb_rand.put``.
            """
            try:
                # The key is the lemma id packed as a native 4-byte int.
                key=struct.pack('i',int(lemmaId))
                ret=self.offdb_rand.put(key,value,0,5)
                if ret==0 or ret==1:
                    L.info("put file %s/%s success %d"%(self.file_name,lemmaId,1))
                else:
                    # NOTE(review): offdb_reconnect is not defined in the visible
                    # class; if it is not provided elsewhere this raises
                    # AttributeError, which the handler below logs.
                    self.offdb_reconnect(5,3)
            except Exception:
                L.error("put file %s/%s err %d because:%s"%(self.file_name,lemmaId,1,traceback.format_exc()))
            finally:
                self.offdb_rand.close()

        def getlemmaId_type(self):
            """Extract the lemma id and action type from the head of the file.

            Returns:
                ``(lemmaId, baike_type)`` as strings. ``baike_type`` is the
                ``<action>`` text, or ``"update"`` when only ``<lemmaId>`` is
                found. ``None`` when the file does not exist or contains no
                ``<lemmaId>`` element in its first 1024 characters.
            """
            # NOTE(review): re.M only affects ^ and $, which these patterns do
            # not use, so '.' still stops at newlines. If <lemmaId> and <action>
            # can sit on different lines, re.S was probably intended -- confirm.
            lemmaId_obj=re.compile("<lemmaId>(.*?)</lemmaId>.*?<action>(.*?)</action>",re.M)
            lemma_obj=re.compile("<lemmaId>(.*?)</lemmaId>",re.M)
            if not os.path.isfile(self.file_name):
                L.info("put file%s not found"%(self.file_name))
                return None
            # Only the first 1024 characters are read, to save memory and keep
            # the regex search fast.
            with open(self.file_name) as f:
                filecontent=f.read(1024)
            m=lemmaId_obj.search(filecontent)
            if m is not None:
                lemmaId=m.group(1)
                baike_type=m.group(2)
            else:
                m_1=lemma_obj.search(filecontent)
                if m_1 is None:
                    L.info("put file %s not found result"%(self.file_name))
                    return None
                lemmaId=m_1.group(1)
                # No <action> element found: treat the record as an update.
                baike_type="update"
            L.info("put file %s lemmaId=%s, type=%s "%(self.file_name,lemmaId,baike_type))
            return (lemmaId,baike_type)
  • 相关阅读:
    pandas 数据结构基础与转换
    Python 基础常用
    css 横向滚动条webkit-scrollbar
    hive mysql 初始化
    hive 的理解
    hive 踩坑
    hbase 调试各种报错
    hbase shell常用命令
    mysql 性能测试工具 mysqlslap
    【CDH学习之一】CDH简介
  • 原文地址:https://www.cnblogs.com/csxf/p/3865682.html
Copyright © 2011-2022 走看看