zoukankan      html  css  js  c++  java
  • Python爬取ithome的所有新闻标题、评论数及其他一些信息并存入Excel中。

     1 # coding=utf-8
     2 import numpy as np
     3 import pandas as pd
     4 import sys
     5 
     6 from selenium import webdriver
     7 import time
     8 import requests
     9 import re
    10 from openpyxl.workbook import Workbook
    11 import matplotlib.pyplot as plt
    12 import matplotlib
    13 
    # Module-level accumulators. Every name is bound to its own, initially
    # empty list; the crawl code further down appends to them in lockstep so
    # that equal indexes describe the same article.
    urls, urls_new = [], []          # article links: all found / successfully parsed
    titles, titles_new = [], []      # article titles: all found / successfully parsed
    days, comments, authors, sources = [], [], [], []  # per-article metadata
    comment, ty = [], []             # `ty` collects the category taken from the URL
    def save_to_file(file_name, contents):
        """Write ``contents`` to ``file_name``, replacing any existing file.

        Args:
            file_name: Path of the file to create or overwrite.
            contents: Text to store.

        The file is always encoded as UTF-8 so that non-ASCII text is written
        the same way on every platform instead of depending on the locale's
        default codec, and the handle is closed even if the write fails.
        """
        with open(file_name, 'w', encoding='utf-8') as fh:
            fh.write(contents)
    28 
    # Crawl the ithome.com front page, open every listed article in Chrome via
    # Selenium, and export date / author / source / title / link / comment count /
    # category to an Excel workbook.

    # Location of the ChromeDriver binary. The backslashes of this Windows path
    # had been lost, which made it unusable; the separators below are a best
    # guess -- adjust the path to the local installation.
    CHROMEDRIVER_PATH = r'D:\谷歌\Google\Chrome\Application\chromedriver.exe'

    # Patterns are compiled once, up front. The character classes need their
    # backslashes (\s, \d, \w); without them they only match the literal letters
    # "s", "d" and "w" and therefore never match the real markup.
    LIST_RE = re.compile(r'<div class="lst lst-1 new-list">(.*?)</div>\s*?</div>')
    BLOCK_RE = re.compile(r'<div class="block \d{4} new-list-\d{1}"(?: style=".*?")?>'
                          r'<ul>(.*?)</ul></div>')
    ITEM_RE = re.compile(r'</span><span class="title">.*?href="(.*?)">'
                         r'(?:<.*?>)?(.*?)(?:</font>)?</a></span></li>')
    # First path segment of the article URL (after an optional "html/").
    CATEGORY_RE = re.compile(r'https://\w+?\.ithome\.com/(?:html/)?(.*?)/.*?')
    # Groups: publication date, source name, author name, comment count.
    META_RE = re.compile(r'<span id="pubtime_baidu">(\d*-\d*-\d*).*?</span><span id="source_baidu">'
                         r'来源:<a href=".*?" .*?>(.*?)</a></span><span id="author_baidu">'
                         r'作者:(?:<strong>)?(.*?)(?:</strong>)?</span>.*?<span id="commentcount">(.*?)</span>')

    url = "https://www.ithome.com/"
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
    # A timeout keeps the script from hanging forever on a stalled connection,
    # and a failed request is reported as such instead of as a parsing error.
    rep = requests.get(url, headers=headers, timeout=30)
    rep.raise_for_status()
    rep.encoding = "utf-8"
    strw = rep.text
    save_to_file('ithome.html', strw)

    m = LIST_RE.findall(strw)
    if not m:
        # Without this guard the script would die with a bare IndexError on m[0].
        raise SystemExit('news list container not found on ' + url)
    print(len(m[0]))
    blocks = BLOCK_RE.findall(m[0])
    print(len(blocks))

    # Collect (link, title) pairs from every list block on the front page.
    for block_html in blocks:
        for link, title in ITEM_RE.findall(block_html):
            urls.append(link)
            titles.append(title)
    print(len(urls))

    broswer = webdriver.Chrome(CHROMEDRIVER_PATH)
    try:
        for article_url, article_title in zip(urls, titles):
            print(u'读取中' + article_url)
            broswer.get(article_url)
            # NOTE(review): presumably gives page scripts time to fill in the
            # comment count before the source is read -- confirm.
            time.sleep(1)
            strw2 = broswer.page_source

            category = CATEGORY_RE.findall(article_url)
            print(category)
            meta = META_RE.findall(strw2)
            print(meta)
            if not meta:
                # Page layout not recognised: skip the article entirely so the
                # result lists stay the same length.
                continue

            day, source, author, comment_count = meta[0]
            days.append(day)
            sources.append(source)
            authors.append(author)
            urls_new.append(article_url)
            comments.append(comment_count)
            titles_new.append(article_title)
            # A URL without a category segment must not abort the whole crawl.
            ty.append(category[0] if category else '')
    finally:
        # Always shut the browser down, even when a page load raises.
        broswer.quit()
    print("读取结束")

    data = {'日期': days, '作者': authors, '来源': sources, '标题': titles_new,
            '链接': urls_new, '评论数量': comments, '新闻类型': ty}
    df = pd.DataFrame(data, columns=['日期', '作者', '来源', '标题', '链接', '评论数量', '新闻类型'])
    # The former `encoding` argument was never used by the xlsx writer and is no
    # longer accepted by current pandas, so it is not passed.
    df.to_excel(r'ShuJuPa.xlsx', sheet_name='数据爬取结果')
  • 相关阅读:
    写一个工具生成数据库实体类
    自己写一个java的mvc框架吧(三)
    自己写一个java的mvc框架吧(二)
    自己写一个java的mvc框架吧(一)
    手把手教你写一个java的orm(完)
    JavaEE系列之(二)commons-fileupload实现文件上传、下载
    JavaEE系列之(一)JSP基础知识详解
    Servlet---JavaWeb技术的核心基础,JavaWeb框架的基石(二)
    Servlet---JavaWeb技术的核心基础,JavaWeb框架的基石(一)
    cygwin简介及使用
  • 原文地址:https://www.cnblogs.com/pangzx/p/9371315.html
Copyright © 2011-2022 走看看