zoukankan      html  css  js  c++  java
  • 根据职位名,自动生成jd

    代码本身就是最好的解释,不赘述。

    文本聚类输出: cluster.py

    #!/usr/bin/env python
    # coding=utf-8
    
    import jieba,re
    from gensim import corpora,models
    from sklearn.cluster import KMeans
    import sys
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    
    class MyCorpus(object):
        """Streaming corpus: one document per line of a text file.

        Iterating yields a jieba token generator for each line, so the
        whole file never has to be held in memory at once.
        """

        def __init__(self, fname):
            # Path of the source file; one document per line.
            self.fname = fname

        def __iter__(self):
            # Use a context manager so the handle is closed deterministically
            # (the original left the file open until garbage collection).
            with open(self.fname) as fin:
                for line in fin:
                    # cut_all=False -> precise (non-exhaustive) segmentation.
                    yield jieba.cut(line, cut_all=False)
    
    
    class MyCluster(object):
        """Cluster short texts with KMeans over dense tfidf vectors and
        return one representative sentence per cluster."""

        def __init__(self):
            # Keep only CJK ideographs, ASCII letters and digits.
            # NOTE(review): the scraped source lost its backslashes and shows
            # "u9f5a"; \u4e00-\u9fa5 is the conventional CJK range -- confirm
            # against the original file.  (Pattern is currently unused.)
            self.CLEAN = re.compile(u"[^\u4e00-\u9fa5A-Za-z0-9]")
            self.dictionary = {}
            self.corpus = []

        def gen_dataset(self, documents):
            """Build the tfidf model from documents, then return one dense
            tfidf vector per document."""
            self.gen_corpus(documents)
            return [self.doc2vec(doc) for doc in documents]

        def gen_corpus(self, documents):
            """Fit dictionary, bag-of-words corpus and tfidf model."""
            texts = [list(jieba.cut(doc)) for doc in documents]
            self.dictionary = corpora.Dictionary(texts)
            self.corpus = [self.dictionary.doc2bow(text) for text in texts]
            self.tfidf = models.TfidfModel(self.corpus)

        def doc2vec(self, doc):
            """Convert one raw document into a dense tfidf vector
            (length == vocabulary size)."""
            # Materialize the jieba generator: doc2bow counts the tokens and
            # a list is safe across gensim versions.
            bow = self.dictionary.doc2bow(list(jieba.cut(doc)))
            dense = [0.0] * len(self.dictionary)
            for term_id, weight in self.tfidf[bow]:
                dense[term_id] = weight
            return dense

        def kcluster(self, texts, k=3):
            """Cluster texts into k groups and return one randomly chosen
            representative sentence per cluster."""
            from random import shuffle
            data = self.gen_dataset(texts)
            # Round for numeric stability; a comprehension (rather than map)
            # yields real lists on both Py2 and Py3.
            data = [[round(x, 5) for x in row] for row in data]
            km = KMeans(n_clusters=k, init='k-means++', max_iter=200,
                        n_init=1, verbose=True)
            km.fit(data)
            # list(...) so shuffle works on Py3, where zip is an iterator.
            pairs = list(zip(km.labels_, texts))
            shuffle(pairs)  # randomize which sentence represents a cluster
            seen = set()    # labels already represented (was a full-length flag list)
            res = []
            for label, text in pairs:
                if label not in seen:
                    res.append(text)
                    seen.add(label)
            return res
    
    
    if __name__ == "__main__":
        # Seed data: one candidate JD sentence per line.
        # (with-block fixes the leaked handle; the broken multi-line string
        # literal in the scraped source is restored to '\n'.)
        with open('data/python.db') as fr:
            texts = [line for line in fr]
        test = MyCluster()
        res = test.kcluster(texts, k=4)
        print('\n'.join(res))

    自动生成主文件: auto_gen_jd.py

    #!/usr/bin/env python
    # coding=utf-8
    
    import sys,os
    import simplejson as json
    import codecs
    # from snownlp import SnowNLP
    from simhash import Simhash
    # from bosonnlp import BosonNLP
    from cluster import MyCluster
    from jd_parser import JdParser
    import re
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    class AutoGenJD(object):
        '''Auto-generate a JD: given a job title and a sentence count,
        produce a list of job-description/requirement sentences.'''

        def __init__(self):
            # Strip leading list markers such as "1." / "(2)" and stray digit+paren.
            # NOTE(review): the scraped source lost every backslash; the \d/\s/\S
            # reconstructions below follow the obvious intent -- confirm against
            # the original file.
            self.CLEAR_NUM = re.compile(u"^\\d+[.、::]|^[((]\\d+[)).]?|\\d\\s*[))】]")
            # Strip leading stray punctuation runs and trailing sentence punctuation.
            self.CLEAR_COLO = re.compile(u"^[。.)(【】]\\S+|[.;:;。]$")
            # job name -> raw JD text, loaded once at startup.
            with codecs.open('data/lagou_jd_clean.json') as fjson:
                self.jd_database = json.load(fjson)
            self.jobname = list(self.jd_database.keys())
            self.jdparser = JdParser()
            self.km = MyCluster()

        def load_json_data(self, fname="../preprocess/data/mini_jd.json", arg1=None, arg2=None):
            """One-off preprocessing: split a json-lines dump into per-job
            ./data/<name>.txt files keyed by data[arg1][arg2]."""
            with codecs.open(fname) as fr:
                for line in fr:
                    try:
                        data = json.loads(line)
                    except Exception as e:  # skip malformed lines, keep going
                        print(e)
                        continue
                    record = data.get(arg1, False)
                    if record != False and "job_title" in record and "job_description" in record:
                        # Drop implausible titles (too short / too long).
                        if len(record["job_title"]) < 2 or len(record["job_title"]) > 16:
                            continue
                        fw = codecs.open('./data/' + record[arg2] + ".txt", 'w', 'utf-8')
                        fw.write(record["job_description"].strip() + "\n\n")
                        fw.close()  # the original leaked one handle per record
                        print("writing... %s" % record[arg2])

        # Remove sequence numbers etc. and deduplicate; writes a cleaned .db file.
        def clean_jd(self, fname="./data/java.txt"):
            # NOTE(review): ".txt" is appended below, so the default produces
            # "java.txt.txt" -- callers apparently pass a bare stem; confirm.
            clean_sents = set()
            with codecs.open(fname + ".txt", 'r', 'utf-8') as fr:
                for line in fr:
                    line = self.CLEAR_NUM.sub("", line.strip())
                    line = self.CLEAR_COLO.sub("", line.strip())
                    if len(line) > 2:
                        clean_sents.add(line.strip())
            with codecs.open(fname[:-3] + "db", 'w', 'utf-8') as fw:
                for line in clean_sents:
                    fw.write(line + '\n')
            return clean_sents

        def is_most_english(self, line):
            """Return True when more than 70% of the characters are ASCII letters."""
            if not line:  # guard: the original divided by zero on empty input
                return False
            en_word = [ch for ch in line
                       if (u'\u0041' <= ch <= u'\u005a') or (u'\u0061' <= ch <= u'\u007a')]
            return float(len(en_word) * 1.0 / len(line)) > 0.7

        def clean_jd2(self, jdstr):
            """
            Clean a raw JD string: drop numbering, stray punctuation and
            mostly-English lines; return the surviving lines as a set.
            """
            res = set()
            for line in jdstr.split("\n"):
                line = line.strip()
                if len(line) < 12:
                    print(u"line %s" % line)  # debug trace for short lines
                # Drop lines with embedded numbering, trailing punctuation, or a
                # leading digit before CJK text; keep length within [8, 32].
                # NOTE(review): CJK bound reconstructed as \u9fa5 (scrape showed
                # "u9f5e") -- confirm.
                if re.search(u"[;.;。]\\d+|\\d?[,,、::.]$|^\\d\\s{0,1}[\u4e00-\u9fa5]", line) \
                        or len(line) < 8 or len(line) > 32:
                    continue
                if self.is_most_english(line):
                    continue
                line = self.CLEAR_NUM.sub("", line)
                line = self.CLEAR_COLO.sub("", line)
                res.add(line)
            return res

        # Return the known job name closest to the user input (simhash distance).
        def get_closet_job(self, jobname="java"):
            target = Simhash(jobname)  # hoisted: was rebuilt for every candidate
            dis = [(other, target.distance(Simhash(other))) for other in self.jobname]
            sorteddis = sorted(dis, key=lambda x: x[1])
            for k, v in sorteddis[:5]:
                print("%s %s" % (k, v))  # debug: five nearest candidates
            return sorteddis[0][0]

        # Clamp the requested number of JD sentences to [1, 20].
        def norm_jd_num(self, num):
            if num < 1:
                num = 1
            elif num > 20:
                num = 20
            return num

        # Get a JD summary for a job name via SnowNLP.
        def get_jd_with_snownlp(self, jobname="java", num=5):
            # NOTE(review): the SnowNLP import is commented out at the top of
            # this file, so this method raises NameError as shipped.
            jobname = self.get_closet_job(jobname)
            jdstr = self.clean_jd2(self.jd_database[jobname])
            s = SnowNLP(jdstr)
            return s.summary(num)

        def get_jd_with_bosonnlp(self, jobname="java", num=5):
            # NOTE(review): self.bosonnlp is commented out in __init__, so this
            # method is dead as shipped.
            res = set()
            jobname = self.get_closet_job(jobname)
            # clean_jd2 returns a set, which cannot be sliced -- materialize it
            # first (the original raised TypeError here).
            jdstr = list(self.clean_jd2(self.jd_database[jobname]))[:80]
            all_cluster = self.bosonnlp.cluster(jdstr)
            sort_all_cluster = sorted(all_cluster, key=lambda x: x['num'], reverse=True)
            for idx, cluster in enumerate(sort_all_cluster):
                print("%s %s" % (idx + 1, cluster['_id']))
                res.add(jdstr[cluster['_id']])
            return res

        def _get_sent_score(self, line):
            """
            Sentence ranking score; smaller sorts earlier.  Start from the
            length plus 100, then discount lines mentioning gender/age,
            education/salary, and experience, in that priority order.
            """
            s = len(line) + 100
            if re.search(u"男|女|男女不限|性别|岁", line):
                s -= 60
            if re.search(u"学历|专业|\\d+[kK元]", line):
                s -= 40
            if re.search(u"经验", line):
                s -= 20
            return s

        def get_jd_with_kmeans(self, jobname='python', num=6):
            """
            Cluster the cleaned JD sentences with KMeans so each cluster
            contributes exactly one sentence; return them ranked.
            """
            jobname = self.get_closet_job(jobname)
            jdstr = self.clean_jd2(self.jd_database[jobname])
            print("jdstr %d" % len(jdstr))
            print(self.jd_database[jobname])
            if len(jdstr) < int(num):
                num = len(jdstr)  # cannot have more clusters than sentences
            res = self.km.kcluster(jdstr, k=int(num))
            # key= replaces the Py2-only cmp=lambda x,y: score(x)-score(y)
            # and produces the identical ordering.
            return sorted(res, key=self._get_sent_score)

        def jd_parser(self, jdstr):
            """Delegate to the structured JD parser."""
            return self.jdparser.parser(jdstr)
    
    if __name__ == "__main__":
        # CLI: auto_gen_jd.py <jobname> <num_sentences>
        # Defaults keep the script usable without arguments (the original
        # raised IndexError); single-arg print(...) emits identical output
        # under both Py2 and Py3.
        test = AutoGenJD()
        jobname = sys.argv[1] if len(sys.argv) > 1 else "python"
        jdnum = int(sys.argv[2]) if len(sys.argv) > 2 else 5
        print("job name: %s" % jobname)
        print("demand:")
        demand = test.get_jd_with_kmeans(jobname, jdnum)
        for i, jdstr in enumerate(demand):
            print("%d. %s" % (i + 1, jdstr))
    每天一小步,人生一大步!Good luck~
  • 相关阅读:
    java操作生成jar包 和写入jar包
    jboss配置jndi连接池
    windows 域的LDAP查询相关举例
    LDAP error Code 及解决方法
    HDU 6417
    CF1299D Around the World
    codechef Chef and The Colored Grid
    Educational Codeforces Round 82 (Rated for Div. 2)
    CF1237F Balanced Domino Placements
    CF1254E Send Tree to Charlie
  • 原文地址:https://www.cnblogs.com/jkmiao/p/4874803.html
Copyright © 2011-2022 走看看