zoukankan      html  css  js  c++  java
  • 数据结构化与保存

    import requests
    import re
    from bs4 import BeautifulSoup
    
    
    def uslHtml(url, timeout=10):
        """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up; passed
                straight to ``requests.get`` so a stalled connection cannot
                block the crawl forever.

        Returns:
            The response body, decoded as UTF-8 and parsed with the built-in
            ``html.parser``.
        """
        res = requests.get(url, timeout=timeout)
        # Force UTF-8 instead of relying on the encoding requests guesses.
        res.encoding = "utf-8"
        return BeautifulSoup(res.text, "html.parser")
    def page(url):
        """Save the body text of every article linked from one list page.

        Downloads the list page at *url*, follows the first link inside each
        ``<li>`` of the first ``.news-list`` element, and writes the text of
        that article's ``#content`` element both to the module-level file
        object ``f`` and to stdout.

        Args:
            url: Address of a news list page.

        Note:
            ``f`` is a global that must already be open for writing when this
            function is called.
        """
        soup = uslHtml(url)
        for item in soup.select(".news-list")[0].select("li"):
            link = item.select("a")[0].attrs["href"]
            # Reuse the shared fetch helper so the article page is decoded
            # and parsed exactly like the list page.
            content = uslHtml(link).select("#content")[0].text
            f.write(content)
            print(content)
    # Read the total article count from the pager on the index page and turn
    # it into a page count (presumably 10 articles per list page -- confirm).
    so = uslHtml("http://news.gzcc.cn/html/xiaoyuanxinwen/")
    n = int(so.select("#pages")[0].select(".a1")[0].text.strip("条"))
    n = n // 10 + 1

    # page() writes through the global name ``f``; the ``with`` block makes
    # sure the file is closed even if one of the downloads fails.
    with open("SchoolNews.txt", "a+", encoding="utf-8") as f:
        page("http://news.gzcc.cn/html/xiaoyuanxinwen/")
        # Only list page 2 is fetched here; ``n`` is not used as the bound.
        for i in range(2, 3):
            url1 = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
            page(url1)
    

      

    2.

    def Url(newsUrl, timeout=10):
        """Download *newsUrl* and return the page as a BeautifulSoup tree.

        Args:
            newsUrl: Address of the page to download.
            timeout: Seconds to wait for the server before giving up; passed
                to ``requests.get`` so a stalled connection cannot hang the
                script.

        Returns:
            The response body, decoded as UTF-8 and parsed with
            ``html.parser``.
        """
        # Named ``response`` rather than ``re`` so the regular-expression
        # module name is not shadowed inside this function.
        response = requests.get(newsUrl, timeout=timeout)
        response.encoding = "utf-8"
        return BeautifulSoup(response.text, "html.parser")
    
    # Module-level accumulator: News() appends one record dict per news item.
    newsArr = list()
    
    def pageNumber():
        """Return the number of list pages computed from the site's pager.

        Reads the text of the first ``.a1`` element inside ``#pages`` on the
        news index page, strips the trailing "条" counter suffix, and converts
        the remaining article count into a page count.

        Returns:
            int: ``count // 10 + 1``.

        Raises:
            IndexError: If the pager elements are not found on the page.
            ValueError: If the pager text is not a number after stripping.
        """
        soup = Url("http://news.gzcc.cn/html/xiaoyuanxinwen/")
        countText = soup.select("#pages")[0].select(".a1")[0].text
        # The suffix must be removed before int() can parse the count.
        newsCount = int(countText.rstrip("条"))
        # NOTE(review): presumably 10 articles per page; when the count is an
        # exact multiple of 10 this yields one page more than needed -- confirm.
        return newsCount // 10 + 1
    
    def site():
        """Scrape the last list page and collect its news items.

        Builds the URL of the page numbered ``pageNumber()``, downloads it
        with ``Url`` and hands the parsed page to ``News``.
        """
        newsPage = pageNumber()
        # range() keeps the loop on the computed last page only; a bare
        # (newsPage, newsPage + 1) tuple would also request the page after it.
        for i in range(newsPage, newsPage + 1):
            othersUrl = "http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html".format(i)
            soup = Url(othersUrl)
            News(soup)
    
    def News(soup):
        """Collect title and description of each news entry in *soup*.

        For every ``.news-list-text`` element, the text of its first
        ``.news-list-title`` and ``.news-list-description`` children is stored
        as a ``{"title": ..., "description": ...}`` dict appended to the
        module-level ``newsArr`` list. The whole list is printed afterwards.

        Args:
            soup: Parsed list page offering a ``select`` method.
        """
        for entry in soup.select(".news-list-text"):
            record = {
                "title": entry.select(".news-list-title")[0].text,
                "description": entry.select(".news-list-description")[0].text,
            }
            newsArr.append(record)
        print(newsArr)
    # Run the scrape: site() feeds the fetched page to News(), which appends
    # the records to newsArr and prints the list.
    site()

    3.

    import pandas

    # Turn the scraped records into a table and save it as a spreadsheet.
    # NOTE(review): the queries below need 'click' and 'sources' columns, but
    # News() above only stores 'title' and 'description' -- confirm that
    # newsArr is populated with those extra keys before running this part.
    df = pandas.DataFrame(newsArr)
    df.to_excel("title.xlsx")

    # First six rows, restricted to three columns.
    print(df[['click', 'title', 'sources']].head(6))

    # Rows with more than 3000 clicks published by one specific source.
    print(df[(df['click'] > 3000) & (df['sources'] == '学校综合办')])

    # Rows whose source is any of the listed departments.
    sou = ['国际学院', '学生工作处']
    print(df[df['sources'].isin(sou)])
  • 相关阅读:
    使用hugo在gitee上写blog
    golang初识2
    golang初识1
    install go on ubuntu
    sql优化的几种方式
    UpdatePanel 无刷新弹出窗口
    .net web 点击链接在页面指定位置显示DIV的问题
    重建主键
    sql 日期时间格式转换
    UpdatePanel无法直接弹出窗口的解决
  • 原文地址:https://www.cnblogs.com/linweicong/p/8857122.html
Copyright © 2011-2022 走看看