zoukankan      html  css  js  c++  java
  • 解析HTML文件

      1 #!/usr/bin/env python3
      2 
      3 # -*- coding: UTF-8 -*-
      4 
      5 from bs4 import BeautifulSoup
      6 import operator
      7 import os,shutil
      8 import re
      9 
     10 def processhtml(item):
     11   html_path = item
     12   with open(html_path) as fp:
     13     soup = BeautifulSoup(fp, "html.parser")
     14   return soup
     15 
     16 def IsComputer(soup_arg):
     17   soup = soup_arg
     18   result = False
     19   try:
     20     value = soup.find('input', {'name':'资源类型'}).get('value')
     21     if value == '主机':
     22       print('资源类型:主机')
     23       result = True
     24     elif value == '数据库':
     25       print('资源类型:数据库')
     26     else:
     27       print('资源类型:其他')
     28   except:
     29     pass
     30   return result
     31 
     32 def IsAgree(soup_arg):
     33   soup = soup_arg
     34   result = False
     35   try:
     36     for row in soup.findAll('tr'):
     37       cells = row.findAll('td')
     38       if len(cells) == 4:
     39         if cells[1].findChild("font") != None:
     40           nStr = ""
     41           nStr = nStr.join(cells[0].string)
     42           target = ['帐号管理人员处理']
     43           if (operator.eq(nStr.split(), target)):
     44             print(cells[1].font.string)
     45           if (operator.eq(nStr.split(), target) and cells[1].font.string == '同意'):
     46             print("满足条件为:%s && 审批意见(同意)" % nStr.split()[0])
     47             result = True
     48   except IndexError as e:
     49     pass
     50   return result
     51 
     52 def IsIntersect(soup_arg):
     53   soup = soup_arg
     54   result = False
     55   try:
     56     value = soup.find('input', {'name':'239385_资源名称'}).get('value')
     57     temp_list = re.split('[、:
    ]', value)
     58     hosts_list = []
     59     hosts_list.clear()
     60     for hostlist in temp_list:
     61       if re.search('[a-z]', hostlist):
     62         print(hostlist)
     63         hosts_list.append(hostlist)
     64     hosts_set = set(hosts_list)
     65     if target_hosts.intersection(hosts_set):
     66       print('非空,有交集')
     67       result = True
     68     else:
     69       print("空,无交集")
     70   except:
     71     pass
     72   return result
     73 
     74 def IsIntersect2(soup_arg):
     75   soup = soup_arg
     76   result = False
     77   try:
     78     value = soup.find('input', {'name':'所在的硬件设备/软件平台'}).get('value')
     79     temp_list = re.split('[、:
    ]', value)
     80     hosts_list = []
     81     hosts_list.clear()
     82     for hostlist in temp_list:
     83       if re.search('[a-z]', hostlist):
     84         hosts_list.append(hostlist)
     85     hosts_set = set(hosts_list)
     86     if target_hosts.intersection(hosts_set):
     87       print('非空,有交集')
     88       result = True
     89     else:
     90       print("空,无交集")
     91   except:
     92     pass
     93   return result
     94 
     95 if __name__ == '__main__':
     96   target_hosts = {'cmszsoaa', 'cmszsoab', 'cmszdcss', 'cmszicss', 'cmsznpsa', 'cmsznpsb', 'cmszinta', 'cmszintb',
     97           'cmszdpsa', 'cmszdpsb', 'mcbsoaa', 'mcbsoab', 'mcbinta', 'mcbintb', 'mcbdpsa', 'mcbdpsb',
     98           'mcbnpsa', 'mcbnpsb', 'mcbdcss', 'mcbicss', 'newdcss', 'newicss'}
     99 
    100   work_dir = '/root/XmlOut/'
    101   target_dir = '/root/AccountOut/'
    102 
    103   for parent, dirnames, filenames in os.walk(work_dir, followlinks=True):
    104     for filename in filenames:
    105       file_path = os.path.join(parent, filename)
    106       print("filename with full path: %s" % file_path)
    107       soup = processhtml(file_path)
    108       flag1 = IsComputer(soup)
    109       flag2 = IsAgree(soup)
    110       flag3 = IsIntersect(soup)
    111       flag4 = IsIntersect2(soup)
    112       if (flag1 and flag2 and (flag3 or flag4)):
    113         print('%s, ok----' % (file_path))
    114         shutil.copy(file_path, target_dir)
  • 相关阅读:
    CentOS查看CPU信息、位数、多核信息
    Linux常用命令大全
    chmod命令详细用法
    tar命令的详细解释
    yum和rpm命令详解
    LeetCode 241. Different Ways to Add Parentheses
    LeetCode 139. Word Break
    LeetCode 201. Bitwise AND of Numbers Range
    LeetCode 486. Predict the Winner
    LeetCode 17. Letter Combinations of a Phone Number
  • 原文地址:https://www.cnblogs.com/donggongdechen/p/9444197.html
Copyright © 2011-2022 走看看