zoukankan      html  css  js  c++  java
  • wiki页面文本挖掘

    import os,sys
    import sys
    from bs4 import BeautifulSoup
    import urllib.request
    # reload(sys)
    # sys.setdefaultencoding('utf-8')
    # Directory two levels above this file; appended to the module search path.
    _script_dir = os.path.dirname(os.path.abspath(__file__))
    BASE_PATH = os.path.dirname(_script_dir)
    sys.path.append(BASE_PATH)
    # "<BASE_PATH><sep>scripts<sep>": the trailing separator lets callers append
    # a file name directly.
    DATA_PATH = os.sep.join((BASE_PATH, 'scripts', ''))
    import time,configparser,re
    #from http.u_http import HttpClient
    #from data.read_wiki import read_wiki
    #from data.read_wiki import wiki
    import json
    #from User import User
    #httpClient = HttpClient()
    import requests

    def login(url,body,url1):
        """Log in to the wiki, fetch a page and print the versions found on it.

        Posts ``body`` to ``url`` to obtain a ``JSESSIONID`` cookie, fetches
        ``url1`` with that cookie and reduces the page to the markup of its
        table cells.  That markup is saved to a timestamp-named file under
        ``DATA_PATH`` and searched for every option listed in
        ``online_list.conf``; for each one found, the text following
        ``<name>/`` is reduced to its digit-delimited part and printed.

        Args:
            url: Login endpoint that receives the POST request.
            body: Form fields sent with the login request.
            url1: Page fetched once the session cookie is available.

        Raises:
            KeyError: If the login response carries no ``JSESSIONID`` cookie.
        """
        # Redirects are not followed, so the session cookie is read from the
        # login response itself.
        s = requests.post(url=url, data=body, allow_redirects=False)
        cks = {'JSESSIONID': s.cookies['JSESSIONID']}
        res = requests.get(url1, cookies=cks)
        res.encoding = 'utf-8'

        # Narrow the page step by step: keep the <tbody> elements, then the
        # <tr> elements inside them, then the <td> elements inside those.
        data = res.text
        for tag in ("tbody", "tr", "td"):
            found = BeautifulSoup(data, 'html.parser').findAll(tag)
            data = ','.join(str(v) for v in found)
        data = str(BeautifulSoup(data, 'html.parser'))

        # Keep a snapshot of the extracted markup, named by the current time.
        subject = time.strftime("%Y%m%d%H%M%S", time.localtime())
        with open(DATA_PATH+subject, "w", encoding='utf-8') as op:
            op.write(data)

        # Collect every option of every section in the project list.
        deploy_order = configparser.ConfigParser()
        deploy_order.read('online_list.conf')
        deploy = [
            option
            for section in deploy_order.sections()
            for option in deploy_order.options(section)
        ]
        print(deploy)

        # A project is part of this release when its name occurs in the page.
        # The in-memory text is searched instead of re-reading the snapshot
        # file once per project.
        plan = []
        for order in deploy:
            if order in data:
                plan.append(order)
            else:
                print("%s没找到"%order)

        # 'comx-bs' is never released.  Build a new list rather than calling
        # remove() while iterating, which would skip the following element.
        plan = [name for name in plan if name != 'comx-bs']
        print(plan)

        # Everything between "<project>/" and the end of its table cell.  The
        # name is escaped so characters such as "." are matched literally.
        plan_order = [
            re.findall(re.escape(plan_name) + '/(.*?)</td>', data, re.S)
            for plan_name in plan
        ]
        print(plan_order)

        # When a project appears several times, keep only the last match.
        plan_order2 = [line[-1] if len(line) > 1 else line for line in plan_order]
        print(plan_order2)

        # Reduce each match to the span running from its first to its last
        # digit (at least three digits).  The pattern needs "\d"; a bare "d"
        # would only ever match the letter.
        plan_order3 = [re.findall(r'\d.*\d.*\d', ''.join(x)) for x in plan_order2]
        print(plan_order3)

        # Flatten each result list into a single string.
        plan_order4 = [''.join(i) for i in plan_order3]
        print(plan_order4)

    def run(pageId):
        """Build the wiki page URL for ``pageId`` and hand it to ``login``.

        The login form is posted to the wiki's ``dologin.action`` endpoint.
        The user name and password are taken from the ``WIKI_USERNAME`` and
        ``WIKI_PASSWORD`` environment variables when they are set; otherwise
        the values previously embedded in this function are used.

        Args:
            pageId: Identifier of the wiki page.  It is converted to ``str``
                and surrounding whitespace is removed before it is placed in
                the URL.
        """
        # Input typed at a prompt may carry stray spaces that would end up in
        # the query string.
        pageId = str(pageId).strip()
        url = "http://wiki.intra.gomeplus.com/pages/viewpage.action?"+'pageId='+pageId
        print(url)
        url1 = 'http://wiki.intra.gomeplus.com/dologin.action'
        # NOTE(review): real credentials are committed here as fallbacks.  They
        # are kept only so existing invocations keep working; prefer the
        # environment variables and remove the literals once callers set them.
        body = {
            'os_username': os.environ.get('WIKI_USERNAME', 'wangsen'),
            'os_password': os.environ.get('WIKI_PASSWORD', 'WANGs1.'),
            'login': '登录',
            'os_destination': '',
        }
        login(url1, body, url)
    # print(value)
    # #body如果需要就填上数据如果不需要就置空,body={}
    # body = {}
    #
    # #接口访问的方式 get或post
    # u_method = "get"
    #
    # #处理v2接口需要在header里加“Accept”
    # header = {}
    # # A451ED019F130356AEF51CB768540B86
    # #value = "A3B6F39B9F9925E6BD5D61280A787892"
    # header["Content-Type"] = "application/x-www-form-urlencoded"
    # header["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
    # cookie='JSESSIONID='+value+'; doc-sidebar=300px;confluence.list.pages.cookie=list-content-tree;
    # confluence.browse.space.cookie=space-pages;Hm_lvt_4d914dda44888419a4588c6a4be8edcc=1473650378'
    # print(cookie)
    # header['Cookie'] =cookie
    # # if ApiIsV2(url):
    # # header["Content-Type"] = "application/x-www-form-urlencoded"
    # # header["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"
    #
    # #verify为验证项列表,用于检查返回内容中的关键字
    # verify = []
    #
    # postData={}
    # #执行被测接口
    # result_dict = httpClient.api_verify(url,postData,u_method,header,verify,body)
    # print(result_dict)
    #接受发版列表
    #faban_list=read_wiki(result_dict)





    #判断是不是v2接口
    # def ApiIsV2(url):
    # if "v2" in url:
    # return True
    # else:
    # return False

    #前提操作
    # def preStep(self):
    # "前提操作"
    # url = ""
    # postData = {}
    # u_method = "get"
    # header = {}
    # verify = []
    #
    # response = httpClient.api_request(url, postData, u_method, header, verify)
    # return response

    # #将公参和必填参数组合
    # def sign_str(self,data,isV2=False):
    # publicParaV1 = {
    # "ip":"0.0.0.0",
    # "appType":"1",
    # "clientOsVersion":"8.4",
    # "sortType":"0",
    # "pubPlat":"0120102002000000",
    # "appVersion":"v1.0.2.33",
    # "latitude":"39.964707",
    # "otherDevInfo":"otherDevInfo",
    # "netType":"3G",
    # "numPerPage":"5",
    # "devId":"0",
    # "clientOs":"1",
    # "mac":"00000000",
    # "lastRecordId":"0",
    # "longitude":"116.47308",
    # "pageNum":"1",
    # "order":"2",
    # "phoneType":"iPhone"
    # }
    #
    # publicParaV2 = {
    # "integrity":"full",
    # "device":"iOS/9.2.1/iPhone/IPhone12345678",
    # "app":"001/1111111111111",
    # "appVersion":"1.0.1",
    # "net":"",
    # "accessToken":"",
    # "traceId":"",
    # "jsonp":""
    # }
    # if isV2:
    # dictMerged = dict(data, **publicParaV2)
    # else:
    # dictMerged = dict(data, **publicParaV1)
    #
    # return dictMerged


    if __name__ == "__main__":
        # Ask for the wiki page id on the console and process that page.
        run(input('wiki_number:'))
  • 相关阅读:
    ios -为什么用WKWebView加载相同的html文本,有时展示的内容却不一样。
    weex
    [Objective-C 面试简要笔记]
    [iOS 基于CoreBluetooth的蓝牙4.0通讯]
    [SVN Mac自带SVN结合新浪SAE进行代码管理]
    [SVN Mac的SVN使用]
    [iOS dispatch_once创建单例]
    [iOS UI设计笔记整理汇总]
    [iOS 视频流开发-获得视频帧处理]
    [iOS OpenCV错误解决]
  • 原文地址:https://www.cnblogs.com/wangsen-123/p/6030812.html
Copyright © 2011-2022 走看看