zoukankan      html  css  js  c++  java
  • 爬虫案例-爬运维工单

    源代码:

    # coding=utf-8
    import requests
    from lxml import etree
    
    
    class ChaxunSpdier:
        """Crawl complaint work orders from the EOMS query interface.

        The spider POSTs a fixed query form to the list endpoint, walks every
        result page through the "next page" link, fetches the detail page of
        each work order and finally prints everything it collected.

        Args:
            cookie: Value of the ``Cookie`` request header. The session id
                expires, so a fresh one normally has to be supplied per run.
            start_time: Lower bound of the dispatch-time filter
                (``YYYY-MM-DD HH:MM:SS``).
            end_time: Upper bound of the dispatch-time filter.
            timeout: Seconds to wait for each HTTP request before giving up.
        """

        def __init__(self,
                     cookie='TSJSESSIONID=0000YvxNFfPYx8EBo8lsKNrKIl6:1bkt8lo7d',
                     start_time='2020-02-02 20:13:35',
                     end_time='2020-02-23 20:13:35',
                     timeout=30):
            self.start_url = 'http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=performQuery'
            # Base that the relative links found in the result pages are appended to.
            self.part_url = 'http://111.40.232.237:9000/eoms35/sheet/complaint/'
            self.start_time = start_time
            self.end_time = end_time
            self.timeout = timeout
            self.headers = {
                'Connection': 'keep-alive',
                'Cookie': cookie,  # session cookie; has to be refreshed for every run
                'Host': '111.40.232.237:9000',
                'Referer': 'http://111.40.232.237:9000/eoms35/sheet/complaint/complaint.do?method=showQueryPage&type=interface&urlType=complaint&userName=liuhaoce&workSerial=0&isDutyMaster=false&workSerialTime=&startDuty=&endDuty=',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'}

        def parse_url(self, url):
            """POST the query form to *url* and return the raw response body.

            Args:
                url: Absolute URL of a list page or a detail page.

            Returns:
                The undecoded response body (``bytes``).
            """
            formdata = {
                'sheetIdStringExpression': 'like',
                'main.sheetId': '',  # work order serial number
                'titleStringExpression': 'like',
                'main.title': '',
                'main.status': '',
                'statusChoiceExpression': '0',
                'task.taskName': '',
                'sendRoleIdStringExpression': 'in',
                'main.sendRoleId': '',
                'sendDeptIdStringExpression': 'in',
                'main.sendDeptId': '',
                'sendUserIdStringExpression': 'in',
                'main.sendUserId': '',
                'operateRoleIdStringExpression': 'in',
                'link.operateRoleId': '',
                'operateDeptIdStringExpression': 'in',
                'link.operateDeptId': '',
                'operateUserIdStringExpression': 'in',
                'link.operateUserId': '',
                'toDeptIdStringExpression': 'in',
                'showArea': '大庆, 铁通',  # province/area that accepted the complaint
                'main.toDeptId': '1005, 1021',
                'main.complaintType1': '',
                'complaintType1ChoiceExpression': '1010615100202',  # complaint type 1: home broadband service
                'main.complaintType2': '',
                'complaintType2ChoiceExpression': '',
                'main.complaintType': '',
                'main.complaintType4': '',
                'main.complaintType5': '',
                'main.complaintType6': '',
                'main.complaintType7': '',
                'complaintNumStringExpression': '',
                'main.complaintNum': '',
                'parentCorrelationStringExpression': '',
                'main.parentCorrelation': '',
                'customAttributionStringExpression': 'like',
                'main.customAttribution': '',
                'repeatComplaintTimesStringExpression': '>=',
                'main.repeatComplaintTimes': '',
                'complaintDescStringExpression': 'like',
                'main.complaintDesc': '',
                'main.sendTime': '',
                'sendTimeStartDateExpression': '>=',
                'sendTimeStartDate': self.start_time,  # start of the time window
                'sendTimeLogicExpression': 'and',
                'sendTimeEndDateExpression': '<=',
                'sendTimeEndDate': self.end_time,  # end of the time window
                'queryType': 'record'
            }
            # Without a timeout a stalled server would block the crawler forever.
            response = requests.post(url, data=formdata, headers=self.headers,
                                     timeout=self.timeout)
            return response.content

        def get_content_list(self, html_raw):
            """Extract the work orders of one result page.

            Args:
                html_raw: Raw HTML of a result page as returned by ``parse_url``.

            Returns:
                A ``(content_list, next_gongdan_url)`` tuple: the list of work
                order dicts found on the page, and the absolute URL of the next
                result page or ``None`` when there is no "next page" link.
            """
            html = etree.HTML(html_raw)
            if html is None:
                # Nothing parseable came back; treat it as an empty last page.
                return [], None

            content_list = []
            for row in html.xpath('//tbody/tr'):  # one <tr> per complaint
                cells = row.xpath('./td')  # the fields of a complaint live in its <td> cells
                if len(cells) < 2:
                    # Header/spacer rows carry no work order; skip instead of crashing.
                    continue
                title = cells[0].xpath('.//text()')
                sheet_id = cells[1].xpath('./a/text()')
                href = cells[1].xpath('./a/@href')
                if not (title and sheet_id and href):
                    # Row lacks the title or the detail link, so it is not a work order.
                    continue
                # The handling-deadline column (cells[3]) was already disabled
                # in the original code and is not collected.
                item = {
                    '工单主题': title[0],
                    '工单流水号': sheet_id[0],
                    'xiangqing': self.get_gongdan_detail(self.part_url + href[0]),
                }
                content_list.append(item)

            # Link to the next page of the work order list, if any.
            next_links = html.xpath("//a[text()='下一页']/@href")
            next_gongdan_url = self.part_url + next_links[0] if next_links else None
            return content_list, next_gongdan_url

        def get_gongdan_detail(self, url):
            """Fetch one work order detail page and pull out its fields.

            Args:
                url: Absolute URL of the detail page.

            Returns:
                A dict whose values are the (possibly empty) lists of text nodes
                matched by each XPath.
            """
            # NOTE(review): the detail page is requested with the same POST form
            # as the list query - confirm that the server expects this.
            html = etree.HTML(self.parse_url(url))
            xiangqing_dict = {}
            xiangqing_dict['投诉内容'] = html.xpath('//*[@id="complainttext"]/text()')
            # The target sits inside an iframe, so this lookup finds nothing.
            xiangqing_dict['派往对象'] = html.xpath('//div[@id="ext-gen47"]/table/tbody/tr[4]/td[4]/text()')
            xiangqing_dict['qita'] = html.xpath('//*[@id="ext-gen47"]/text()')

            return xiangqing_dict

        def save_content_list(self, content_list):
            """Print every collected work order, numbered from 1."""
            for i, v in enumerate(content_list, start=1):
                print(i, v)

        def run(self):
            """Crawl all result pages starting at ``start_url`` and print the result."""
            next_url = self.start_url  # main work order query page
            content_total_list = []
            while next_url is not None:
                html_raw = self.parse_url(next_url)  # raw HTML of the current result page
                # Work orders of this page plus the link to the following page.
                content_list, next_url = self.get_content_list(html_raw)
                content_total_list.extend(content_list)  # grow in place, no list copy per page
            self.save_content_list(content_total_list)  # print every work order
    
    # Script entry point: build the spider and crawl every result page.
    if __name__ == '__main__':
        ChaxunSpdier().run()
  • 相关阅读:
    Codeforces Round #601 (Div. 2)
    A. A Serial Killer
    B. Sherlock and his girlfriend
    Codeforces Round #600 (Div. 2)
    Manthan, Codefest 19 (open for everyone, rated, Div. 1 + Div. 2) C. Magic Grid
    7213:垃圾炸弹
    2011
    Educational Codeforces Round 46 (Rated for Div. 2)
    Stall Reservations
    Pots
  • 原文地址:https://www.cnblogs.com/iamorz/p/12358379.html
Copyright © 2011-2022 走看看