  • Crawling all the campus news

1. Get the click count from a news URL and the publication date (converted to a datetime), each wrapped in its own function

• Getting a news article's click count
    # Get the click count for a news article
    def get_click_num(arg_news_url):
        click_id = re.findall(r"(\d{1,5})", arg_news_url)[-1]  # findall returns every match; the last run of digits is the article id
        click_url = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(click_id)
        response = requests.get(click_url)
        # The counter API embeds the number in a snippet ending in ...html('N');
        # note that lstrip/rstrip strip any characters in the given set, not a literal prefix/suffix
        news_click_num = int(response.text.split('.html')[-1].lstrip("('").rstrip("');"))
        return news_click_num
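
The string handling works because the counter endpoint returns a small JavaScript snippet with the number embedded in it. A minimal sketch of the parsing, assuming a response of roughly the form shown (the element id '#hits' is an assumption, not confirmed by the original post):

    # Hypothetical counter response, for illustration only
    sample = "$('#hits').html('1234');"
    tail = sample.split('.html')[-1]              # "('1234');"
    count = int(tail.lstrip("('").rstrip("');"))  # drop any leading (/' and trailing '/)/; characters
    print(count)                                  # 1234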
• Getting the publication date and converting it to a datetime
    # Get the publication date and convert it to a datetime object
    def get_news_date(showinfo):
        news_date = showinfo.split()[0].split(':')[1]  # first field holds the date after a colon
        news_time = showinfo.split()[1]                # second field is the time
        news_dt = news_date + " " + news_time
        dt = datetime.strptime(news_dt, "%Y-%m-%d %H:%M:%S")
        return dt
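
For example, assuming the show-info text begins with a labeled date field followed by the time, as the splits above imply (and assuming the ASCII colon ':' the code splits on; if the page actually uses the full-width '：', the separator must be changed):

    # Hypothetical show-info text, for illustration only
    print(get_news_date("发布时间:2019-04-08 11:13:01 作者:admin"))
    # -> 2019-04-08 11:13:01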

2. Get the news details from a news URL

    # Get the news title for the given URL, then call the publication-date and click-count helpers
    def get_news_detail(arg_news_url):
        news_detail = {}
        response = requests.get(arg_news_url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        news_detail['NewsTitle'] = soup.select('.show-title')[0].text
        showinfo = soup.select('.show-info')[0].text
        news_detail['NewsDateTime'] = get_news_date(showinfo)
        news_detail['NewsClickNum'] = get_click_num(arg_news_url)
        return news_detail
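
Used on its own with a single article URL (the URL below is hypothetical), the function returns a dictionary:

    # Hypothetical article URL, for illustration only
    detail = get_news_detail('http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0404/11089.html')
    print(detail['NewsTitle'], detail['NewsDateTime'], detail['NewsClickNum'])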

3. Get the news URLs from each list page and collect their details

    # Get the details of every news item on pages 22-31 and collect them into a list
    def get_news_list():
        news_list = []
        page_urls = get_page_urls()
        for page_url in page_urls:
            sleep()
            response = requests.get(page_url)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            for news in soup.select('li'):
                if len(news.select('.news-list-title')) > 0:  # only <li> elements that actually hold a news item
                    news_url = news.select('a')[0]['href']                      # link
                    news_desc = news.select('.news-list-description')[0].text   # description
                    news_dict = get_news_detail(news_url)
                    news_dict['NewsUrl'] = news_url
                    news_dict['Description'] = news_desc
                    news_list.append(news_dict)
        return news_list
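
The .news-list-title check matters because the list page's <li> elements include navigation and layout items as well as news entries; only list items that contain a news title are followed into their detail pages.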

4. Build the URLs of all the target pages, then use them to fetch all the news

    # Build the URLs of campus-news pages 22-31 and store them in a list
    # (range(22, 32) excludes the end value, so page 32 is not fetched)
    def get_page_urls():
        page_urls = []
        for i in range(22, 32):
            page_url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
            page_urls.append(page_url)
        return page_urls
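
A quick sanity check of the generated URLs:

    urls = get_page_urls()
    print(urls[0])    # http://news.gzcc.cn/html/xiaoyuanxinwen/22.html
    print(urls[-1])   # http://news.gzcc.cn/html/xiaoyuanxinwen/31.html
    print(len(urls))  # 10 -- range(22, 32) excludes the end value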

5. Set a reasonable crawl interval

    # Pause for a random 0-3 seconds between requests
    def sleep():
        time.sleep(random.random() * 3)
        return
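
A randomized delay (anywhere from 0 to 3 seconds here) makes the request pattern less mechanical than a fixed interval, which is gentler on the server and less likely to trigger rate limiting.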

6. Do simple data processing with pandas and save the result

    def save_file():
        news_df = pd.DataFrame(get_news_list())
        print(news_df)
        news_df.to_csv(r'f:\news.csv')  # save as a CSV file in the root of drive F ('f:news.csv' would be drive-relative)
        return
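
To spot-check the saved file, it can be read back with pandas (same path as above):

    df = pd.read_csv(r'f:\news.csv')
    print(df.head())  # NewsTitle, NewsDateTime, NewsClickNum, NewsUrl, Description
    # to_csv writes the row index by default, so an unnamed index column also appears on read-back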

7. Run results
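
Running save_file() prints the assembled DataFrame, one row per scraped article with the columns NewsTitle, NewsDateTime, NewsClickNum, NewsUrl, and Description, and writes the same table to f:\news.csv.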

8. Full source code

    import re
    import time
    import random
    import requests
    import pandas as pd
    from bs4 import BeautifulSoup
    from datetime import datetime


    # Get the click count for a news article
    def get_click_num(arg_news_url):
        click_id = re.findall(r"(\d{1,5})", arg_news_url)[-1]  # findall returns every match; the last run of digits is the article id
        click_url = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(click_id)
        response = requests.get(click_url)
        # The counter API embeds the number in a snippet ending in ...html('N');
        # note that lstrip/rstrip strip any characters in the given set, not a literal prefix/suffix
        news_click_num = int(response.text.split('.html')[-1].lstrip("('").rstrip("');"))
        return news_click_num


    # Get the publication date and convert it to a datetime object
    def get_news_date(showinfo):
        news_date = showinfo.split()[0].split(':')[1]  # first field holds the date after a colon
        news_time = showinfo.split()[1]                # second field is the time
        news_dt = news_date + " " + news_time
        dt = datetime.strptime(news_dt, "%Y-%m-%d %H:%M:%S")
        return dt


    # Get the news title for the given URL, then call the publication-date and click-count helpers
    def get_news_detail(arg_news_url):
        news_detail = {}
        response = requests.get(arg_news_url)
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        news_detail['NewsTitle'] = soup.select('.show-title')[0].text
        showinfo = soup.select('.show-info')[0].text
        news_detail['NewsDateTime'] = get_news_date(showinfo)
        news_detail['NewsClickNum'] = get_click_num(arg_news_url)
        return news_detail


    # Build the URLs of campus-news pages 22-31 and store them in a list
    # (range(22, 32) excludes the end value, so page 32 is not fetched)
    def get_page_urls():
        page_urls = []
        for i in range(22, 32):
            page_url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
            page_urls.append(page_url)
        return page_urls


    # Get the details of every news item on pages 22-31 and collect them into a list
    def get_news_list():
        news_list = []
        page_urls = get_page_urls()
        for page_url in page_urls:
            sleep()
            response = requests.get(page_url)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            for news in soup.select('li'):
                if len(news.select('.news-list-title')) > 0:  # only <li> elements that actually hold a news item
                    news_url = news.select('a')[0]['href']                      # link
                    news_desc = news.select('.news-list-description')[0].text   # description
                    news_dict = get_news_detail(news_url)
                    news_dict['NewsUrl'] = news_url
                    news_dict['Description'] = news_desc
                    news_list.append(news_dict)
        return news_list


    # Pause for a random 0-3 seconds between requests
    def sleep():
        time.sleep(random.random() * 3)
        return


    def save_file():
        news_df = pd.DataFrame(get_news_list())
        print(news_df)
        news_df.to_csv(r'f:\news.csv')  # save as a CSV file in the root of drive F
        return


    if __name__ == '__main__':
        save_file()
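
The requests.get calls above use no timeout, custom headers, or error handling, so a single slow or failing page can stall or abort the whole crawl. A minimal hardening sketch using only the standard requests API (the User-Agent value and 10-second timeout are arbitrary choices, not part of the original code):

    # Sketch only: a fetch helper with a timeout, a browser-like User-Agent,
    # and status checking; the constants here are illustrative assumptions.
    HEADERS = {'User-Agent': 'Mozilla/5.0'}

    def fetch(url, timeout=10):
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()  # raise an HTTPError on 4xx/5xx responses
        response.encoding = 'utf-8'
        return response

Each requests.get in the script could then go through fetch, with a try/except around the per-article calls so one bad URL is skipped instead of killing the run.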