zoukankan      html  css  js  c++  java
  • python爬取邮箱

    上次纠结了那么久。结果不用编码也是可以匹配邮箱的。

    下面是一个用队列实现,广度优先的简单爬虫代码。先就这样吧,目测暂时不会再理它了,以后有时间再修改。(又是一个烂尾。。。。。)

    View Code
      1 # -*- coding: cp936 -*-
      2  import urllib2
      3  import re
      4  from pyquery import PyQuery as pq
      5  from lxml import etree
      6   
      # E-mail matcher: a run of letters, digits or underscores, "@", one or
      # more dot-terminated alphanumeric labels, then an alphabetic final label.
      # NOTE: the local part does not include "." or "-", so for an address like
      # "first.last@example.com" only "last@example.com" is matched.
      # Raw string so that "\." reaches the regex engine without relying on
      # Python passing an unknown string escape through unchanged.
      mailpattern = re.compile(r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+')
      9  
     10  htmlcount = 0  # number of pages BFS has processed so far
     11  maxcount = 3000  # BFS stops once htmlcount reaches this value
     12  allUrls = set()  # every URL discovered during the crawl; dumped to furls at the end
     13  allMails = set()  # every e-mail address found by savemails; dumped to fmails at the end
     14  UrlsQlist = []  # NOTE(review): BFS builds its own local UrlsQlist, so this one appears unused
     15  UrlsQdict = {}  # NOTE(review): not referenced anywhere in the visible code
     16  url = "http://www.163.com"  # seed URL handed to BFS; geturls also uses it as the base for non-http links
     17  fmails = open("E:/py/crawler/mailresult.txt","a")  # append-mode output for harvested addresses
     18  furls = open("E:/py/crawler/urlresult.txt","a")  # append-mode output for discovered URLs
     19  
     20  
     21   
     22   
     def geturls(data, base_url=None):
         """Collect the link targets of every <a> element in an HTML document.

         data     -- HTML source text; a falsy value (None or "") means there
                     is nothing to parse.
         base_url -- URL that relative links are resolved against.  Defaults
                     to the module-level seed ``url``.

         Returns a set of absolute http/https URLs, or None when ``data`` is
         falsy (callers test for None before using the result).
         """
         if not data:
             return None
         # Imported locally because this script's import header has no urlparse.
         import urlparse

         if base_url is None:
             base_url = url
         urls = set()
         # A PyQuery result is a list of lxml elements, so the attribute can be
         # read directly with element.get() in a single pass over the page.
         for anchor in pq(data)('a'):
             href = anchor.get('href')
             if href is None:
                 continue
             # urljoin copes with "/path", "page.html", "../x" and "//host/x",
             # all of which plain string concatenation turned into broken URLs.
             absolute = urlparse.urljoin(base_url, href.strip())
             # Keeps only crawlable links: javascript:, mailto:, tel: and the
             # like are dropped.  urlsplit lower-cases the scheme, so the check
             # is case-insensitive.
             if urlparse.urlsplit(absolute).scheme in ('http', 'https'):
                 urls.add(absolute)
         return urls
     41          
     def gethtml(url):
         """Download ``url`` and return the raw response body.

         Returns None (after printing a short message) when the URL cannot be
         opened.  Errors raised while reading the body still propagate.
         """
         try:
             fp = urllib2.urlopen(url)
         except Exception:
             # ``Exception`` rather than a bare except so Ctrl-C and SystemExit
             # still stop the crawler.
             print("urllib2.urlopen error")
             return None
         try:
             return fp.read()
         finally:
             # Release the connection even if read() fails part-way.
             fp.close()
     52      
     def savemails(data):
         """Add every address that ``mailpattern`` finds in ``data`` to ``allMails``.

         Falsy ``data`` (None or an empty string) is ignored.
         """
         if not data:
             return
         found = mailpattern.findall(data)
         # allMails is a set, so duplicates are discarded on insertion.
         allMails.update(found)
     59          
     def savehtml(pagecontent, count, directory="E:/py/crawler/html/"):
         """Write one crawled page to disk.

         pagecontent -- page body, or None when the download failed.
         count       -- running page number, used as the file name.
         directory   -- folder that receives the file (must already exist).

         A successful page is stored as ``<count>.html``.  A failed one is
         recorded as ``<count>error.html`` containing a placeholder text.
         """
         # Imported locally because this script's import header has no os.
         import os

         if pagecontent is not None:
             name, text = str(count) + ".html", pagecontent
         else:
             name, text = str(count) + "error" + ".html", "this page empty"
         # ``with`` closes the file even when the write raises.
         with open(os.path.join(directory, name), "w") as f:
             f.write(text)
     69          
     def BFS(firstUrl):
         """Crawl breadth-first from ``firstUrl`` using a FIFO queue.

         Each dequeued URL is downloaded, saved to disk, scanned for e-mail
         addresses and parsed for further links.  The loop ends when
         ``maxcount`` pages have been processed or no URL is left to visit.
         Discovered URLs accumulate in the global ``allUrls`` set and the page
         counter in the global ``htmlcount``.
         """
         global htmlcount
         # Local import: deque gives O(1) pops from the front of the queue.
         from collections import deque

         allUrls.add(firstUrl)
         queue = deque(allUrls)
         # ``queue and ...`` stops cleanly when the frontier is exhausted
         # instead of raising IndexError on an empty queue.
         while queue and htmlcount < maxcount:
             current = queue.popleft()
             page = gethtml(current)
             savehtml(page, htmlcount)
             savemails(page)
             found = geturls(page)
             if found is not None:
                 for u in found:
                     # Checking against allUrls (everything ever queued) rather
                     # than the pending queue keeps already-visited pages from
                     # being queued and downloaded again.
                     if u not in allUrls:
                         allUrls.add(u)
                         queue.append(u)
             htmlcount = htmlcount + 1
     87          
     88          
     # Run the crawl, then dump the collected addresses and URLs one per line.
     # try/finally makes sure both output files are closed even if the crawl
     # itself fails.
     try:
         BFS(url)
         for out, items in ((fmails, allMails), (furls, allUrls)):
             for u in items:
                 try:
                     out.write(u)
                     out.write('\n')
                 except Exception:
                     # Best effort, as before: an entry that cannot be written
                     # is skipped.  ``Exception`` (not a bare except) keeps
                     # Ctrl-C working.
                     continue
     finally:
         fmails.close()
         furls.close()

    2013.5.13 update

    本来想再加个多线程。。。。结果看了好多资料,无处下手,再研究研究,日后再改

    加了点 url规范化。代码整理如下:

      1 import urllib2
      2 import re
      3 from pyquery import PyQuery as pq
      4 from lxml import etree
      5 import urlparse
      6 import time
      7 
      8 allUrls = set()  # every URL discovered so far; mailCrawler.BFS adds to it
      9 allMails = set()  # every e-mail address found; mailCrawler.savemails adds to it
     10 urlsDownlist = []  # URLs in the order mailCrawler.BFS dequeued them for download
     11 
     class mailCrawler:
         """Breadth-first crawler that harvests e-mail addresses from web pages.

         Results are accumulated in the module-level ``allUrls``, ``allMails``
         and ``urlsDownlist`` containers rather than on the instance.
         """

         def __init__(self, mailExpression, start_url, maxcount,
                      html_dir="E:/py/crawler/html/"):
             """Set up a crawler.

             mailExpression -- regular expression that matches an e-mail address.
             start_url      -- URL the crawl starts from.
             maxcount       -- maximum number of pages to process.
             html_dir       -- existing folder that receives the saved pages.
             """
             self.mailpattern = re.compile(mailExpression)
             self.maxcount = maxcount
             self.htmlcount = 0
             self.UrlsQlist = []  # FIFO queue of URLs waiting to be fetched
             self.url = start_url
             self.html_dir = html_dir

         def url_normal(self, url):
             """Return ``url`` in a normalized form so duplicates compare equal.

             The host is lower-cased; in the path, repeated slashes are
             collapsed, one trailing "." and one trailing "/" are removed and
             whitespace is deleted.  Any fragment is dropped.
             """
             scheme, netloc, path, query = urlparse.urlsplit(url)[:4]
             netloc = netloc.lower()

             if path:
                 path = re.sub(r'/{2,}', '/', path)  # collapse repeated slashes
                 path = re.sub(r'\.$', '', path)     # drop a trailing dot
                 path = re.sub(r'/$', '', path)      # drop a trailing slash
                 path = re.sub(r'\s', '', path)      # remove whitespace
             if query:
                 return '%s://%s%s?%s' % (scheme, netloc, path or '/', query)
             return '%s://%s%s' % (scheme, netloc, path)

         def geturls(self, data, base_url=None):
             """Extract the links of every <a> element in an HTML document.

             data     -- HTML source text; falsy means there is nothing to parse.
             base_url -- URL that relative links are resolved against; defaults
                         to the crawler's start URL.

             Returns a set of normalized http/https URLs, or None when ``data``
             is falsy.
             """
             if not data:
                 return None
             if base_url is None:
                 base_url = self.url
             urls = set()
             # A PyQuery result is a list of lxml elements, so the attribute
             # can be read directly with element.get().
             for anchor in pq(data)('a'):
                 href = anchor.get('href')
                 if href is None:
                     continue
                 # urljoin resolves "/path", "page.html", "../x" and "//host/x"
                 # correctly; concatenating onto the start URL did not.
                 absolute = urlparse.urljoin(base_url, href.strip())
                 # Only http/https links are crawlable; this also drops
                 # javascript: and mailto: links regardless of letter case.
                 if urlparse.urlsplit(absolute).scheme in ('http', 'https'):
                     urls.add(self.url_normal(absolute))
             return urls

         def gethtml(self, url):
             """Download ``url`` with a 5 second timeout.

             Returns the response body, or None (after printing a message)
             when the URL cannot be opened.
             """
             try:
                 fp = urllib2.urlopen(url, None, 5)
             except Exception:
                 # ``Exception`` rather than a bare except so Ctrl-C still
                 # stops the crawler.
                 print("urllib2.urlopen error  or timeout")
                 return None
             try:
                 return fp.read()
             finally:
                 # Release the connection even if read() fails part-way.
                 fp.close()

         def savemails(self, data):
             """Add every address found in ``data`` to the global ``allMails`` set."""
             if data:
                 allMails.update(self.mailpattern.findall(data))

         def savehtml(self, pagecontent, htmlcount, url):
             """Save one page under ``html_dir``.

             A downloaded page is written to ``<htmlcount>.html``.  When
             ``pagecontent`` is None the failing ``url`` is recorded in
             ``<htmlcount>error.html`` instead.
             """
             # Imported locally because the file's import header has no os.
             import os

             if pagecontent is not None:
                 target = os.path.join(self.html_dir, str(htmlcount) + ".html")
                 with open(target, "w") as f:
                     f.write(pagecontent)
                 return
             target = os.path.join(self.html_dir,
                                   str(htmlcount) + "error" + ".html")
             with open(target, "w") as f:
                 try:
                     f.write(url)
                 except UnicodeError:
                     # The URL could not be encoded for the file; leave a
                     # marker instead.
                     f.write("encode error")

         def BFS(self):
             """Crawl breadth-first from the start URL using a FIFO queue.

             Stops after ``maxcount`` pages or when no URL is left to visit.
             """
             allUrls.add(self.url)
             self.UrlsQlist = list(allUrls)
             # ``self.UrlsQlist and ...`` ends the crawl cleanly when the queue
             # runs dry instead of raising IndexError from pop(0).
             while self.UrlsQlist and self.htmlcount < self.maxcount:
                 tempUrl = self.UrlsQlist.pop(0)
                 print(tempUrl)
                 urlsDownlist.append(tempUrl)
                 myWebStr = self.gethtml(tempUrl)
                 self.savehtml(myWebStr, self.htmlcount, tempUrl)
                 self.savemails(myWebStr)
                 # Relative links are resolved against the page they came from.
                 found = self.geturls(myWebStr, tempUrl)
                 if found is not None:
                     for u in found:
                         if u not in allUrls:
                             allUrls.add(u)
                             self.UrlsQlist.append(u)
                 self.htmlcount = self.htmlcount + 1
    116             
    117 
    def _write_lines(handle, items):
        """Write each item to ``handle`` on its own line, skipping failures."""
        for item in items:
            try:
                handle.write(item)
                handle.write('\n')
            except Exception:
                # Best effort, as before: an entry that cannot be written is
                # skipped.  ``Exception`` (not a bare except) keeps Ctrl-C
                # working.
                continue


    def main():
        """Crawl up to 100 pages from the start URL and dump the results.

        Appends the harvested addresses, all discovered URLs and the URLs that
        were actually downloaded to three text files, one entry per line.
        """
        reg = r'[A-Za-z0-9_]+@(?:[A-Za-z0-9]+\.)+[A-Za-z]+'
        url = "http://www.baidu.com"
        count = 100
        # The files are opened before the crawl, as before, so a bad output
        # path fails immediately.  ``with`` closes all three even if the crawl
        # raises.
        with open("E:/py/crawler/mailresult.txt", "a") as fmails, \
                open("E:/py/crawler/urlresult.txt", "a") as furls, \
                open("E:/py/crawler/urlDownresult.txt", "a") as fdownUrls:
            newcrawler = mailCrawler(reg, url, count)
            newcrawler.BFS()
            _write_lines(fmails, allMails)
            _write_lines(furls, allUrls)
            _write_lines(fdownUrls, urlsDownlist)


    if __name__ == '__main__':
        main()
  • 相关阅读:
    变量
    总结 对象
    学生管理系统
    [Altera] Device Part Number Format
    [Matlab] sum
    [Matlab] Galois Field
    [C] static和extern的作用
    [Python] list
    [Python] raw_input
    [软件] UnicornViewer
  • 原文地址:https://www.cnblogs.com/xibaohe/p/3055482.html
Copyright © 2011-2022 走看看