zoukankan      html  css  js  c++  java
  • Coursera课程笔记----P4E.Capstone----Week 4&5

    Spidering and Modeling Email Data (Weeks 4 & 5)

    Mailing List - Gmane

    • Crawl the archive of a mailing list
    • Do some analysis / cleanup
    • Visualize the data as a word cloud and line graphs

    code segment

    gmane.py

    import sqlite3
    import time
    import ssl
    import urllib.request, urllib.parse, urllib.error
    from urllib.parse import urljoin
    from urllib.parse import urlparse
    import re
    from datetime import datetime, timedelta
    
    # Not all systems have this so conditionally define parser
    try:
        import dateutil.parser as parser
    except:
        pass
    
    def parsemaildate(md) :
        """Convert a mail header date (weekday already removed) to ISO 8601.

        md is expected to look like "12 Dec 2005 10:30:00 -0500".  dateutil
        is used when its import succeeded; otherwise a small set of
        strptime() formats is tried.

        Returns the ISO 8601 string, or None when dateutil is unavailable
        (or fails) and none of the fallback formats matches.
        """
        # See if we have dateutil.  A NameError (the conditional import
        # failed) or a parse error both fall through to the manual parser.
        try:
            return parser.parse(md).isoformat()
        except Exception:
            pass

        # Non-dateutil version - we try our best
        pieces = md.split()
        notz = " ".join(pieces[:4]).strip()

        # strptime() needs an exact format, so try each known variation
        dnotz = None
        for form in ( '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M',
                      '%d %b %y %H:%M:%S', '%d %b %y %H:%M' ) :
            try:
                dnotz = datetime.strptime(notz, form)
                break
            except ValueError:
                continue

        if dnotz is None :
            return None

        # Only a numeric "+HHMM" / "-HHMM" offset is accepted.  Anything
        # else (missing, "GMT", "(EST)", ...) falls back to a zero offset
        # instead of being appended verbatim to the timestamp.
        tz = "+00:00"
        if len(pieces) > 4 and re.fullmatch(r'[+-]\d{4}', pieces[4]) :
            raw = pieces[4]
            if raw == '-0000' : raw = '+0000'
            tz = raw[:3] + ":" + raw[3:]

        return dnotz.isoformat() + tz
    
    # SSL context that skips hostname and certificate verification
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    baseurl = "http://mbox.dr-chuck.net/sakai.devel/"

    # Local store for the crawled messages
    conn = sqlite3.connect('content.sqlite')
    cur = conn.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS Messages
        (id INTEGER UNIQUE, email TEXT, sent_at TEXT,
         subject TEXT, headers TEXT, body TEXT)''')

    # Resume after the highest id already stored; max(id) is NULL (None)
    # for an empty table, which is mapped to 0 below
    cur.execute('SELECT max(id) FROM Messages' )
    try:
        row = cur.fetchone()
        start = 0 if row is None else row[0]
    except:
        start = 0

    if start is None :
        start = 0
    
    # Crawl loop: ask how many messages to fetch, retrieve each one, split
    # it into headers and body, pull out sender / date / subject and store
    # the result in the Messages table.  Six consecutive failures stop
    # the crawl.
    many = 0
    count = 0
    fail = 0
    while True:
        if many < 1 :
            sval = input('How many messages:')
            if len(sval) < 1 : break
            try:
                many = int(sval)
            except ValueError:
                # Ask again instead of crashing on non-numeric input
                print('Please enter a whole number')
                continue

        start = start + 1
        cur.execute('SELECT id FROM Messages WHERE id=?', (start,) )
        row = cur.fetchone()
        # Already stored: move on to the next id without using up the quota
        if row is not None : continue

        many = many - 1
        url = baseurl + str(start) + '/' + str(start + 1)

        try:
            # Open with a timeout of 30 seconds; the with-block closes the
            # response even when reading or decoding fails
            with urllib.request.urlopen(url, None, 30, context=ctx) as document:
                status = document.getcode()
                text = document.read().decode()
            if status != 200 :
                print("Error code=",status, url)
                break
        except KeyboardInterrupt:
            print('')
            print('Program interrupted by user...')
            break
        except Exception as e:
            print("Unable to retrieve or parse page",url)
            print("Error",e)
            fail = fail + 1
            if fail > 5 : break
            continue

        print(url,len(text))
        count = count + 1

        if not text.startswith("From "):
            print(text)
            print("Did not find From ")
            fail = fail + 1
            if fail > 5 : break
            continue

        # The first blank line separates the headers from the body
        pos = text.find("\n\n")
        if pos > 0 :
            hdr = text[:pos]
            body = text[pos+2:]
        else:
            print(text)
            print("Could not find break between headers and body")
            fail = fail + 1
            if fail > 5 : break
            continue

        # Prefer the "Name <addr>" form, then fall back to a bare address
        email = None
        x = re.findall(r'\nFrom: .* <(\S+@\S+)>\n', hdr)
        if len(x) != 1 :
            x = re.findall(r'\nFrom: (\S+@\S+)\n', hdr)
        if len(x) == 1 :
            email = x[0].strip().lower().replace("<","")

        # sent_at must be reset for every message; otherwise a message
        # without a usable Date header would reuse the previous value
        sent_at = None
        y = re.findall(r'Date: .*, (.*)\n', hdr)
        if len(y) == 1 :
            tdate = y[0]
            tdate = tdate[:26]
            try:
                sent_at = parsemaildate(tdate)
            except Exception:
                print(text)
                print("Parse fail",tdate)
                fail = fail + 1
                if fail > 5 : break
                continue

        subject = None
        z = re.findall(r'Subject: (.*)\n', hdr)
        if len(z) == 1 : subject = z[0].strip().lower()

        # Reset the fail counter
        fail = 0
        print("   ",email,sent_at,subject)
        cur.execute('''INSERT OR IGNORE INTO Messages (id, email, sent_at, subject, headers, body)
            VALUES ( ?, ?, ?, ?, ?, ? )''', ( start, email, sent_at, subject, hdr, body))
        # Commit in batches and pause briefly every 100 messages
        if count % 50 == 0 : conn.commit()
        if count % 100 == 0 : time.sleep(1)

    conn.commit()
    cur.close()
    conn.close()
    

    gmodel.py

    import sqlite3
    import time
    import re
    import zlib
    from datetime import datetime, timedelta
    
    # Not all systems have this
    try:
        import dateutil.parser as parser
    except:
        pass
    
    # Lookup tables consulted by fixsender(); both start out empty
    dnsmapping = {}
    mapping = {}
    
    def fixsender(sender,allsenders=None) :
        """Normalize a sender address.

        The address is stripped, lower-cased and freed of angle brackets.
        When allsenders is given and the address ends in 'gmane.org', the
        text before the first '-' is used as a prefix to look up a
        replacement: first among allsenders, then among the keys of the
        global mapping; with no match the prefix itself becomes the sender.
        Finally the domain is shortened to its last two labels (for .edu,
        .com, .org, .net) or last three labels (everything else) and run
        through the global dnsmapping.

        Returns None for a None sender, and the cleaned string unchanged
        when it does not contain exactly one '@'.
        """
        global dnsmapping
        global mapping
        if sender is None :
            return None
        sender = sender.strip().lower().replace('<','').replace('>','')

        # Check if we have a hacked gmane.org from address
        if allsenders is not None and sender.endswith('gmane.org') :
            prefix = sender.split('-')[0]
            for candidate in allsenders:
                if candidate.startswith(prefix) :
                    sender = candidate
                    break
            else:
                for known in mapping:
                    if known.startswith(prefix) :
                        sender = mapping[known]
                        break
                else:
                    sender = prefix

        halves = sender.split("@")
        if len(halves) != 2 :
            return sender
        local, domain = halves
        labels = domain.split(".")
        keep = 2 if domain.endswith((".edu", ".com", ".org", ".net")) else 3
        domain = ".".join(labels[-keep:])
        domain = dnsmapping.get(domain, domain)
        return local + '@' + domain
    
    def parsemaildate(md) :
        """Convert a mail header date (weekday already removed) to ISO 8601.

        md is expected to look like "12 Dec 2005 10:30:00 -0500".  dateutil
        is used when its import succeeded; otherwise a small set of
        strptime() formats is tried.

        Returns the ISO 8601 string, or None when dateutil is unavailable
        (or fails) and none of the fallback formats matches.
        """
        # See if we have dateutil.  A NameError (the conditional import
        # failed) or a parse error both fall through to the manual parser.
        try:
            return parser.parse(md).isoformat()
        except Exception:
            pass

        # Non-dateutil version - we try our best
        pieces = md.split()
        notz = " ".join(pieces[:4]).strip()

        # strptime() needs an exact format, so try each known variation
        dnotz = None
        for form in ( '%d %b %Y %H:%M:%S', '%d %b %Y %H:%M',
                      '%d %b %y %H:%M:%S', '%d %b %y %H:%M' ) :
            try:
                dnotz = datetime.strptime(notz, form)
                break
            except ValueError:
                continue

        if dnotz is None :
            # print('Bad Date:',md)
            return None

        # Only a numeric "+HHMM" / "-HHMM" offset is accepted.  Anything
        # else (missing, "GMT", "(EST)", ...) falls back to a zero offset
        # instead of being appended verbatim to the timestamp.
        tz = "+00:00"
        if len(pieces) > 4 and re.fullmatch(r'[+-]\d{4}', pieces[4]) :
            raw = pieces[4]
            if raw == '-0000' : raw = '+0000'
            tz = raw[:3] + ":" + raw[3:]

        return dnotz.isoformat() + tz
    
    # Parse out the info...
    def parseheader(hdr, allsenders=None):
        """Extract (guid, sender, subject, sent_at) from a raw header block.

        hdr is the header text of one message; allsenders is passed through
        to fixsender() when normalizing the sender.

        Returns None when hdr is empty, when parsemaildate() raises, or when
        any of the four fields could not be determined.
        """
        if hdr is None or len(hdr) < 1 : return None

        # Prefer the "Name <addr>" form, then fall back to a bare address
        sender = None
        x = re.findall(r'\nFrom: .* <(\S+@\S+)>\n', hdr)
        if len(x) < 1 :
            x = re.findall(r'\nFrom: (\S+@\S+)\n', hdr)
        if len(x) >= 1 :
            sender = x[0]

        # normalize the domain name of Email addresses
        sender = fixsender(sender, allsenders)

        sent_at = None
        y = re.findall(r'\nDate: .*, (.*)\n', hdr)
        if len(y) >= 1 :
            tdate = y[0]
            tdate = tdate[:26]
            try:
                sent_at = parsemaildate(tdate)
            except Exception:
                # print('Date ignored ',tdate)
                return None

        subject = None
        z = re.findall(r'\nSubject: (.*)\n', hdr)
        if len(z) >= 1 : subject = z[0].strip().lower()

        guid = None
        z = re.findall(r'\nMessage-ID: (.*)\n', hdr)
        if len(z) >= 1 : guid = z[0].strip().lower()

        if sender is None or sent_at is None or subject is None or guid is None :
            return None
        return (guid, sender, subject, sent_at)
    
    conn = sqlite3.connect('index.sqlite')
    cur = conn.cursor()

    # Drop any existing tables so the index is rebuilt from scratch
    for drop_sql in (
        '''DROP TABLE IF EXISTS Messages ''',
        '''DROP TABLE IF EXISTS Senders ''',
        '''DROP TABLE IF EXISTS Subjects ''',
        '''DROP TABLE IF EXISTS Replies ''',
    ) :
        cur.execute(drop_sql)

    # Recreate the four tables of the index
    for create_sql in (
        '''CREATE TABLE IF NOT EXISTS Messages
        (id INTEGER PRIMARY KEY, guid TEXT UNIQUE, sent_at INTEGER,
         sender_id INTEGER, subject_id INTEGER,
         headers BLOB, body BLOB)''',
        '''CREATE TABLE IF NOT EXISTS Senders
        (id INTEGER PRIMARY KEY, sender TEXT UNIQUE)''',
        '''CREATE TABLE IF NOT EXISTS Subjects
        (id INTEGER PRIMARY KEY, subject TEXT UNIQUE)''',
        '''CREATE TABLE IF NOT EXISTS Replies
        (from_id INTEGER, to_id INTEGER)''',
    ) :
        cur.execute(create_sql)
    
    conn_1 = sqlite3.connect('mapping.sqlite')
    cur_1 = conn_1.cursor()

    # Domain rewrites: both sides are stored stripped and lower-cased
    cur_1.execute('''SELECT old,new FROM DNSMapping''')
    for (old_dns, new_dns) in cur_1 :
        dnsmapping[old_dns.strip().lower()] = new_dns.strip().lower()

    # Address rewrites: both sides are normalized through fixsender()
    mapping = dict()
    cur_1.execute('''SELECT old,new FROM Mapping''')
    for (old_addr, new_addr) in cur_1 :
        old = fixsender(old_addr)
        new = fixsender(new_addr)
        mapping[old] = fixsender(new)

    # Done with mapping.sqlite
    conn_1.close()
    
    # Open the main content (Read only)
    conn_1 = sqlite3.connect('file:content.sqlite?mode=ro', uri=True)
    cur_1 = conn_1.cursor()

    # Collect every distinct, normalized, non-gmane sender address.
    # allsenders stays a list because fixsender() scans it in order and
    # stops at the first prefix match; the companion set only makes the
    # duplicate check constant-time instead of a scan of the whole list.
    allsenders = list()
    seen_senders = set()
    cur_1.execute('''SELECT email FROM Messages''')
    for message_row in cur_1 :
        sender = fixsender(message_row[0])
        if sender is None : continue
        if 'gmane.org' in sender : continue
        if sender in seen_senders : continue
        seen_senders.add(sender)
        allsenders.append(sender)

    print("Loaded allsenders",len(allsenders),"and mapping",len(mapping),"dns mapping",len(dnsmapping))
    
    # Walk every crawled message in sent_at order, parse its headers and
    # write the normalized sender / subject / message rows into index.sqlite.
    cur_1.execute('''SELECT headers, body, sent_at
        FROM Messages ORDER BY sent_at''')

    # In-memory caches of the ids already assigned in index.sqlite
    senders = dict()
    subjects = dict()
    guids = dict()

    count = 0

    for message_row in cur_1 :
        hdr = message_row[0]
        parsed = parseheader(hdr, allsenders)
        if parsed is None: continue
        (guid, sender, subject, sent_at) = parsed

        # Apply the sender mapping
        sender = mapping.get(sender,sender)

        count = count + 1
        if count % 250 == 1 : print(count,sent_at, sender)
        # print(guid, sender, subject, sent_at)

        if 'gmane.org' in sender:
            print("Error in sender ===", sender)

        sender_id = senders.get(sender,None)
        subject_id = subjects.get(subject,None)

        if sender_id is None :
            cur.execute('INSERT OR IGNORE INTO Senders (sender) VALUES ( ? )', ( sender, ) )
            cur.execute('SELECT id FROM Senders WHERE sender=? LIMIT 1', ( sender, ))
            row = cur.fetchone()
            if row is None :
                print('Could not retrieve sender id',sender)
                break
            sender_id = row[0]
            senders[sender] = sender_id
        if subject_id is None :
            cur.execute('INSERT OR IGNORE INTO Subjects (subject) VALUES ( ? )', ( subject, ) )
            cur.execute('SELECT id FROM Subjects WHERE subject=? LIMIT 1', ( subject, ))
            row = cur.fetchone()
            if row is None :
                print('Could not retrieve subject id',subject)
                break
            subject_id = row[0]
            subjects[subject] = subject_id
        # print(sender_id, subject_id)
        cur.execute('INSERT OR IGNORE INTO Messages (guid,sender_id,subject_id,sent_at,headers,body) VALUES ( ?,?,?,datetime(?),?,? )',
                ( guid, sender_id, subject_id, sent_at,
                zlib.compress(message_row[0].encode()), zlib.compress(message_row[1].encode())) )
        cur.execute('SELECT id FROM Messages WHERE guid=? LIMIT 1', ( guid, ))
        row = cur.fetchone()
        if row is None :
            print('Could not retrieve guid id',guid)
            break
        guids[guid] = row[0]

        # Commit in batches; the reads above run on the same connection and
        # therefore already see the uncommitted rows
        if count % 250 == 0 : conn.commit()

    # Flush the last partial batch, then release both databases
    conn.commit()
    cur.close()
    conn.close()
    cur_1.close()
    conn_1.close()
    

    gbasic.py

    import sqlite3
    import time
    import zlib
    
    howmany = int(input("How many to dump? "))

    conn = sqlite3.connect('index.sqlite')
    cur = conn.cursor()

    # id -> sender address
    senders = dict(cur.execute('SELECT id, sender FROM Senders'))

    # id -> subject line
    subjects = dict(cur.execute('SELECT id, subject FROM Subjects'))

    # id -> (guid, sender_id, subject_id, sent_at); headers and body are
    # not selected
    messages = {
        record[0]: (record[1], record[2], record[3], record[4])
        for record in cur.execute('SELECT id, guid,sender_id,subject_id,sent_at FROM Messages')
    }

    print("Loaded messages=",len(messages),"subjects=",len(subjects),"senders=",len(senders))
    
    # Tally messages per sender id and per sender mail domain
    sendcounts = dict()
    sendorgs = dict()
    for message in messages.values():
        sender = message[1]
        sendcounts[sender] = 1 + sendcounts.get(sender,0)
        parts = senders[sender].split("@")
        # Only addresses with exactly one '@' contribute to the domain tally
        if len(parts) == 2 :
            dns = parts[1]
            sendorgs[dns] = 1 + sendorgs.get(dns,0)
    
    # Report the busiest senders; the listing stops right after the first
    # entry whose count is below 10
    print('')
    print('Top',howmany,'Email list participants')

    by_sender = sorted(sendcounts.items(), key=lambda pair: pair[1], reverse=True)
    for (sender_id, total) in by_sender[:howmany]:
        print(senders[sender_id], total)
        if total < 10 :
            break

    # Same report, grouped by mail domain
    print('')
    print('Top',howmany,'Email list organizations')

    by_org = sorted(sendorgs.items(), key=lambda pair: pair[1], reverse=True)
    for (org, total) in by_org[:howmany]:
        print(org, total)
        if total < 10 :
            break
    
  • 相关阅读:
    解决is not a supported wheel on this platform-解决pip has no attribute pep425tags-解决网上旧教程不适用的问题
    语音信号实验1-基于时域分析技术的语音识别
    解决 ModuleNotFoundError: No module named 'pip'
    Typecho如何上传模板和插件
    获取图片的URL链接(Typecho修改背景图片等都需要URL链接)
    ubuntu云服务器安装anaconda+jupyter配置
    ORACLE用户密码过期啦
    关于Nacos启动报如下错:nacos no javac in (usr/lib/jvm-1.8.0)
    记录一次华为云服务器给根目录扩容
    记录一次NFS快照es集群索引备份
  • 原文地址:https://www.cnblogs.com/maimai-d/p/12775908.html
Copyright © 2011-2022 走看看