zoukankan      html  css  js  c++  java
  • python zip用法

    # Query magi.com for one search term and print, for every fact card found
    # in the result page, the number parsed from its first <span> text (stored
    # as `reliability`), the text of its <dd> elements (`sop`) and the link
    # targets listed under it (`sop_url`).
    import re

    import lxml.etree
    import requests

    url = "https://magi.com/search"

    querystring = {"q":"堕却乡"}

    # Headers copied from a captured browser/Postman request.
    # NOTE(review): 'accept-encoding' advertises "br"; the response body can
    # only be decoded if a Brotli decoder is installed next to requests --
    # confirm, or drop that header and let requests pick its own value.
    headers = {
        'authority': "magi.com",
        'pragma': "no-cache",
        'cache-control': "no-cache,no-cache",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'referer': "https://magi.com/search?q=%E7%89%B9%E6%96%AF%E6%8B%89",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9",
        'cookie': "acw_tc=7af6142615735221487104171e68298facdedf1e07add2205636582990",
        'Postman-Token': "dda0d475-41b9-44b4-812a-6dd489fe19dd,64d3ddc4-7036-4c42-bff6-53dcbc065db2",
        'Host': "magi.com",
        'Connection': "keep-alive"
        }

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.request("GET", url,
                                headers=headers,
                                params=querystring,
                                timeout=10,
                                )

    taxt = lxml.etree.HTML(response.text)
    cells = taxt.xpath("//main//div[@data-type='fact']//article[@class='fact']")
    for cell in cells:
        sop = cell.xpath(".//dl/dd//text()")
        sop_url = cell.xpath(".//div/ul//ol//li//a//@href")
        span_texts = cell.xpath(".//div//span//text()")

        # Take the last 1-3 digit run of the first span text. The pattern must
        # be r"\d": a bare "d" would match the letter, not digits. Cards with
        # no span text or no digits yield None instead of an IndexError.
        digits = re.findall(r"(\d{1,3})", span_texts[0]) if span_texts else []
        reliability = digits[-1] if digits else None
        print(reliability,sop,sop_url)
    
    
    
    #
    # Download one article, split the text under <body> into sentences and
    # collect every sentence that mentions the terms of the `spo` triple.
    import re

    import chardet
    import lxml.etree
    import requests

    url = "https://www.tuicool.com/articles/jiyEnq7"

    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
        }
    # Three terms to look for; presumably subject / predicate / object.
    spo=['特斯拉', '电池供应商', '松下']

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.request("GET", url, headers=headers, timeout=10)
    # Decode with the charset detected from the raw bytes rather than the
    # one declared in the HTTP headers.
    response.encoding = chardet.detect(response.content)["encoding"]
    taxt = lxml.etree.HTML(response.text)
    list_sentence = taxt.xpath("//body//text()")

    # Sentence terminators, captured so they can be glued back on. A character
    # class avoids the unescaped "?" that made the old pattern "(。|!|?)"
    # raise re.error; the full-width forms are included for Chinese text.
    sentence_end = re.compile(r"([。!?!?])")

    # For these predicate values only spo[0] and spo[2] have to appear in a
    # sentence; otherwise all three terms must be present.
    if spo[1] in ("描述", "标签", "近义项"):
        required_terms = (spo[0], spo[2])
    else:
        required_terms = (spo[0], spo[1], spo[2])

    spo_sentence = []
    for text_node in list_sentence:
        # split() with a capture group returns text, terminator, text, ...
        parts = sentence_end.split(text_node)
        sentence_list = [body + end for body, end in zip(parts[0::2], parts[1::2])]
        # Keep a trailing fragment that has no terminator (this also covers a
        # text node that contains no terminator at all).
        if parts[-1]:
            sentence_list.append(parts[-1])
        for sentence in sentence_list:
            if all(term in sentence for term in required_terms):
                spo_sentence.append(sentence)
                print(sentence)
    if spo_sentence:
        item = {
            "spo_sentence": spo_sentence,
            "spo": spo
        }
        print(item)
    

      

  • 相关阅读:
    Django基础
    XSS BOT编写
    Weblogic ssrf+Redis Getshell学习
    CORS与JSONP配置不当所导致的信息泄露
    VulnHub FristLeaks 1.3
    攻防世界--REVERSE新手练习区writeup
    2019第三届强网杯线下3道RW
    SCTF2019--WEB 2题复现
    文件上传--利用.htaccess绕过黑名单
    CEF 与 QML 对比
  • 原文地址:https://www.cnblogs.com/wang102030/p/11848573.html
Copyright © 2011-2022 走看看