zoukankan      html  css  js  c++  java
  • 解析HTML文件

      1 #!/usr/bin/env python3
      2 
      3 # -*- coding: UTF-8 -*-
      4 
      5 from bs4 import BeautifulSoup
      6 import operator
      7 import os,shutil
      8 import re
      9 
     def processhtml(item):
       """Parse the HTML file at path ``item`` and return its BeautifulSoup tree.

       The file is opened in binary mode so that BeautifulSoup does the decoding
       itself rather than the decoding depending on the default text encoding of
       the process that happens to run the script.

       Args:
         item: Path of the HTML file to parse.

       Returns:
         The ``BeautifulSoup`` object built with the ``html.parser`` backend.

       Raises:
         OSError: If the file cannot be opened or read.
       """
       with open(item, 'rb') as fp:
         # Parsing must happen while the handle is still open.
         return BeautifulSoup(fp, "html.parser")
     15 
     def IsComputer(soup_arg):
       """Tell whether the document's '资源类型' input has the value '主机'.

       Looks up the ``<input>`` element named '资源类型' and prints which of the
       three categories ('主机', '数据库', anything else) its ``value`` falls in.

       Args:
         soup_arg: Parsed document; only its ``find`` method is used.

       Returns:
         True only when the value is exactly '主机'. False for any other value
         and when the input element is absent (nothing is printed in that case).
       """
       field = soup_arg.find('input', {'name': '资源类型'})
       if field is None:
         # Missing field is an expected situation, so it is handled explicitly
         # instead of being hidden behind a catch-all ``except``.
         return False

       value = field.get('value')
       if value == '主机':
         print('资源类型:主机')
         return True
       if value == '数据库':
         print('资源类型:数据库')
       else:
         print('资源类型:其他')
       return False
     31 
     def IsAgree(soup_arg):
       """Tell whether the '帐号管理人员处理' row carries the opinion '同意'.

       Every ``<tr>`` is inspected. A row is considered only when it has exactly
       four ``<td>`` cells and its second cell contains a ``<font>`` element. The
       first cell's text, split on whitespace, must be the single token
       '帐号管理人员处理'; for such rows the ``<font>`` text is printed, and the
       row counts as a match when that text is exactly '同意'.

       Args:
         soup_arg: Parsed document; its ``findAll`` method is used to list rows.

       Returns:
         True if at least one row matches, otherwise False.
       """
       target = ['帐号管理人员处理']
       result = False
       for row in soup_arg.findAll('tr'):
         cells = row.findAll('td')
         if len(cells) != 4:
           continue
         if cells[1].findChild("font") is None:
           continue

         label = cells[0].string
         if label is None:
           # BeautifulSoup gives ``.string`` as None when the cell does not hold
           # a single text node; such a row cannot be the one we look for, and
           # joining None would raise TypeError.
           continue

         words = label.split()
         if words != target:
           continue

         opinion = cells[1].font.string
         print(opinion)
         if opinion == '同意':
           print("满足条件为:%s && 审批意见(同意)" % words[0])
           result = True
       return result
     51 
     def IsIntersect(soup_arg, hosts=None):
       """Tell whether the '239385_资源名称' input names any of the target hosts.

       The input's ``value`` is split into tokens; tokens containing at least one
       lowercase ASCII letter are printed and collected, and the collection is
       intersected with the target hosts. The outcome of the intersection is
       printed as well.

       NOTE(review): in the published listing the separator pattern was broken
       across two lines (an unterminated string literal). It is reconstructed
       here as '、', ':' and any whitespace (which includes the line break) --
       confirm against the original script.

       Args:
         soup_arg: Parsed document; only its ``find`` method is used.
         hosts: Optional iterable of host names to match against. When omitted,
           the module-level ``target_hosts`` set is used, which must then have
           been defined before the call.

       Returns:
         True if at least one collected token is a target host. False otherwise,
         including when the input element or its ``value`` attribute is missing.
       """
       if hosts is None:
         # Script usage: the set is created by the __main__ section.
         hosts = target_hosts

       field = soup_arg.find('input', {'name': '239385_资源名称'})
       if field is None:
         return False
       value = field.get('value')
       if value is None:
         return False

       found = set()
       for token in re.split(r'[、:\s]+', value):
         if re.search('[a-z]', token):
           print(token)
           found.add(token)

       if set(hosts).intersection(found):
         print('非空,有交集')
         return True
       print("空,无交集")
       return False
     73 
     def IsIntersect2(soup_arg, hosts=None):
       """Tell whether the '所在的硬件设备/软件平台' input names any target host.

       The input's ``value`` is split into tokens; tokens containing at least one
       lowercase ASCII letter are collected and intersected with the target
       hosts. The outcome of the intersection is printed.

       NOTE(review): in the published listing the separator pattern was broken
       across two lines (an unterminated string literal). It is reconstructed
       here as '、', ':' and any whitespace (which includes the line break) --
       confirm against the original script.

       Args:
         soup_arg: Parsed document; only its ``find`` method is used.
         hosts: Optional iterable of host names to match against. When omitted,
           the module-level ``target_hosts`` set is used, which must then have
           been defined before the call.

       Returns:
         True if at least one collected token is a target host. False otherwise,
         including when the input element or its ``value`` attribute is missing.
       """
       if hosts is None:
         # Script usage: the set is created by the __main__ section.
         hosts = target_hosts

       field = soup_arg.find('input', {'name': '所在的硬件设备/软件平台'})
       if field is None:
         return False
       value = field.get('value')
       if value is None:
         return False

       found = {
         token
         for token in re.split(r'[、:\s]+', value)
         if re.search('[a-z]', token)
       }

       if set(hosts).intersection(found):
         print('非空,有交集')
         return True
       print("空,无交集")
       return False
     94 
     if __name__ == '__main__':
       # Host names a request must mention to be selected. IsIntersect and
       # IsIntersect2 read this module-level set.
       target_hosts = {'cmszsoaa', 'cmszsoab', 'cmszdcss', 'cmszicss', 'cmsznpsa', 'cmsznpsb', 'cmszinta', 'cmszintb',
               'cmszdpsa', 'cmszdpsb', 'mcbsoaa', 'mcbsoab', 'mcbinta', 'mcbintb', 'mcbdpsa', 'mcbdpsb',
               'mcbnpsa', 'mcbnpsb', 'mcbdcss', 'mcbicss', 'newdcss', 'newicss'}

       work_dir = '/root/XmlOut/'
       target_dir = '/root/AccountOut/'

       # shutil.copy needs an existing destination directory; create it up front
       # so the run does not fail on the first matching file.
       os.makedirs(target_dir, exist_ok=True)

       # Walk the whole input tree (following symlinks) and test every file.
       for parent, dirnames, filenames in os.walk(work_dir, followlinks=True):
         for filename in filenames:
           file_path = os.path.join(parent, filename)
           print("filename with full path: %s" % file_path)
           soup = processhtml(file_path)

           # All four checks are evaluated for every file (no short-circuit) so
           # that each one prints its diagnostics.
           is_host = IsComputer(soup)
           is_approved = IsAgree(soup)
           name_hit = IsIntersect(soup)
           platform_hit = IsIntersect2(soup)

           if is_host and is_approved and (name_hit or platform_hit):
             print('%s, ok----' % (file_path))
             # NOTE: files are copied flat into target_dir, so equal file names
             # coming from different sub-directories overwrite each other.
             shutil.copy(file_path, target_dir)
  • 相关阅读:
    个人网址收集
    使用 TListView 控件(2)
    C# 语法练习(14): 类[六] 事件
    如何在 "万一的 Delphi 博客" 回复自动格式化的着色代码?
    使用 TListView 控件(1)
    C# 语法练习(13): 类[五] 索引器
    使用 TListView 控件(4)
    C# 语法练习(11): 类[三] 构造函数、析构函数、base、this
    C# 语法练习(12): 类[四] 抽象类与抽象成员、密封类与密封成员
    使用 TListView 控件(3)
  • 原文地址:https://www.cnblogs.com/donggongdechen/p/9444197.html
Copyright © 2011-2022 走看看