zoukankan      html  css  js  c++  java
  • Python 规范化LinkedIn用户联系人的职位名

    CODE:

    #!/usr/bin/python 
    # -*- coding: utf-8 -*-
    
    '''
    Created on 2014-8-19
    @author: guaguastd
    @name: job_title_standard.py
    '''
    
    import csv
    import os
    import re
    from collections import Counter
    from operator import itemgetter

    from prettytable import PrettyTable
    
    # Path of the exported LinkedIn connections CSV file.
    # The drive root must be written as an escaped backslash ("\\"): a lone
    # backslash would escape the closing quote and turn the whole script
    # into a syntax error.
    CSV_FILE = os.path.join(
        r"E:", "\\", "eclipse", "LinkedIn", "dfile", "my_connections.csv"
    )

    # Ordered (abbreviation, expansion) pairs used to normalise job titles:
    # the first item of each pair is converted to the second item.  The
    # dotted spellings ("Sr.", "Jr.") are listed before the undotted ones so
    # that the trailing period is consumed together with the abbreviation.
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]
    
    # Load every connection row into memory as a list of dicts keyed by the
    # CSV header.  The with-block closes the file handle as soon as the rows
    # have been read instead of leaving it open until the interpreter exits.
    with open(CSV_FILE) as csv_file:
        csvReader = csv.DictReader(csv_file, delimiter=',', quotechar='"')
        contacts = list(csvReader)
    
    # Read in the job title of every contact and split apart any combined
    # titles written with a slash, like "President/CEO".  Other spellings of
    # a combined title ("President & CEO", "President and CEO") are not
    # split and are kept as a single title.
    titles = []
    for contact in contacts:
        # csv.DictReader stores None for a column missing from a short row.
        job_title = contact['Job Title'] or ''
        for part in job_title.split('/'):
            part = part.strip()
            # Skip blank fragments: an empty field, or the empty piece left
            # by a stray slash as in "CEO/".
            if part:
                titles.append(part)
    
    # Replace common/known abbreviations.
    # Abbreviations are matched on word boundaries so that they are only
    # expanded when they stand alone: a plain substring replace would turn
    # "SVP" into "SVice President" and "DIRECTOR" into
    # "DIREChief Technology OfficerR".
    compiled_transforms = []
    for abbrev, expansion in transforms:
        pattern = re.escape(abbrev)
        # Only anchor a side that starts/ends with a letter or digit; "\b"
        # after the trailing period of "Sr." would wrongly require a word
        # character to follow it.
        if abbrev[:1].isalnum():
            pattern = r'\b' + pattern
        if abbrev[-1:].isalnum():
            pattern = pattern + r'\b'
        compiled_transforms.append((re.compile(pattern), expansion))

    for i, title in enumerate(titles):
        for pattern, expansion in compiled_transforms:
            # A callable replacement inserts the expansion verbatim, without
            # interpreting backslashes as regex escapes.
            title = pattern.sub(lambda _match: expansion, title)
        titles[i] = title
    
    # Print out a table of titles sorted by frequency, most frequent first.
    pt = PrettyTable(field_names=['Title', 'Freq'])
    pt.align = 'l'
    c = Counter(titles)
    # most_common() yields (title, count) pairs ordered by descending count.
    # Every count in a Counter built from a list is at least 1, so no
    # further filtering on the count is needed.
    for title, freq in c.most_common():
        pt.add_row([title, freq])
    # The call form of print works on both Python 2 and Python 3.
    print(pt)
    
    # Print out a table of tokens sorted by frequency, most frequent first.
    # A token is a whitespace-separated word of a title with any leading or
    # trailing commas removed.
    tokens = []
    for title in titles:
        tokens.extend(word.strip(',') for word in title.split())
    pt = PrettyTable(field_names=['Token', 'Freq'])
    pt.align = 'l'
    c = Counter(tokens)
    # most_common() yields (token, count) pairs ordered by descending count.
    for token, freq in c.most_common():
        # Tokens of one or two characters are left out of the table.
        if len(token) > 2:
            pt.add_row([token, freq])
    # The call form of print works on both Python 2 and Python 3.
    print(pt)

    RESULT:

    +-----------------------------------+------+
    | Title                             | Freq |
    +-----------------------------------+------+
    | Senior Software Developer         | 1    |
    | Sales Manager                     | 1    |
    | Software Manager                  | 1    |
    | Online Marketing Manager          | 1    |
    | Senior Consultant                 | 1    |
    | Chief Executive Officer & Founder | 1    |
    | Director                          | 1    |
    | S                                 | 1    |
    | Student                           | 1    |
    | Senior Software Engineer          | 1    |
    | ???                               | 1    |
    +-----------------------------------+------+
    +------------+------+
    | Token      | Freq |
    +------------+------+
    | Manager    | 3    |
    | Senior     | 3    |
    | Software   | 3    |
    | Marketing  | 1    |
    | Founder    | 1    |
    | Consultant | 1    |
    | Executive  | 1    |
    | Sales      | 1    |
    | Developer  | 1    |
    | Director   | 1    |
    | Chief      | 1    |
    | Officer    | 1    |
    | Student    | 1    |
    | Online     | 1    |
    | ???        | 1    |
    | Engineer   | 1    |
    +------------+------+



  • 相关阅读:
    谍战系列
    干将莫邪
    漫话安全众测
    一句话安全
    jsp一句话
    struts2的DevMode模式
    morse code
    Nessus的安装/激活/更新
    WinPcap4.13无法安装解决方法
    安全用网,你应该知道的事
  • 原文地址:https://www.cnblogs.com/lytwajue/p/7224304.html
Copyright © 2011-2022 走看看