  • Crawl 10 pages of Baidu "资讯" (news-tab) results for each of five companies (e.g. 阿里巴巴, 京东, 亚马逊, 华为, 贵州茅台).

    Store the data in MySQL with the fields: company name, news title, URL, news source, and publication time.

    import time
    import pymysql
    import requests
    from bs4 import BeautifulSoup
    from requests import RequestException
         
         
    def get_one_page(url):
        # Fetch one search-result page; the desktop browser User-Agent makes the
        # request look like a normal browser rather than the default requests UA.
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36'
                       + ' (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
            response = requests.get(url, headers=headers)
            #response.encoding = response.apparent_encoding
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None
         
    def parse_one_page(c, text):
        soup = BeautifulSoup(text, 'lxml')
        titles = soup.select('.c-title > a')
        sources = soup.find_all(name='p', class_='c-author')  # class is a Python keyword, so BeautifulSoup uses class_
        companys = ['阿里巴巴','京东','亚马逊','华为','贵州茅台']
        for i in range(len(titles)):
            # Each p.c-author contains "source\xa0\xa0time", with the fields
            # separated by non-breaking spaces (\xa0).
            data = {
                'company': companys[c],
                'title': titles[i].get_text().strip(),
                'link': titles[i]['href'],
                'source': sources[i].get_text().strip().split('\xa0')[0].strip(),
                'time': sources[i].get_text().strip().split('\xa0')[2].strip()
            }
            yield data
    # .string only returns text for a tag with a single text child; it returns
    # None if the tag also contains child tags.
    # get_text() returns all text inside a tag, including its descendants' text.
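    # A quick illustration of the difference (standalone, not used by the crawler):
    #   demo = BeautifulSoup('<p>hi <b>there</b></p>', 'lxml')
    #   demo.p.string     -> None, because <p> has a child tag <b>
    #   demo.p.get_text() -> 'hi there', including the text inside <b>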
    
    def create_sql():
        # Run once: create the spiders database and the baidu table.
        db = pymysql.connect(host='localhost', user='root', password='123456', port=3306)
        cursor = db.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS spiders DEFAULT CHARACTER SET utf8")
        cursor.execute("USE spiders")  # switch databases instead of opening a second connection
        sql = ("CREATE TABLE IF NOT EXISTS baidu ("
               "company VARCHAR(255) NOT NULL, "
               "title VARCHAR(255) NOT NULL, "
               "link VARCHAR(255) NOT NULL, "
               "source VARCHAR(255) NOT NULL, "
               "time VARCHAR(255) NOT NULL)")
        cursor.execute(sql)
        db.close()
        
    def write_to_sql(data):
        # Build a parameterized INSERT from the dict's keys and write one row.
        table = 'baidu'
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
        db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
        cursor = db.cursor()
        try:
            if cursor.execute(sql, tuple(data.values())):
                print('Successful')
                db.commit()
        except Exception:  # a bare except would also swallow KeyboardInterrupt
            print('Failed')
            db.rollback()
        db.close()
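
    # With the dict yielded by parse_one_page, the statement built above is:
    #   INSERT INTO baidu(company, title, link, source, time)
    #   VALUES (%s, %s, %s, %s, %s)
    # pymysql fills the %s placeholders from tuple(data.values()), so the
    # scraped text is escaped by the driver rather than spliced into the SQL.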
    
        
    def main(c, url):
        for pn in range(0, 91, 10):  # pn = 0, 10, ..., 90 -> the first 10 result pages
            link = url + '&x_bfe_rqs=03E80&tngroupname=organic_news&rsv_dl=news_b_pn&pn=' + str(pn)
            text = get_one_page(link)
            if text is None:  # skip pages whose request failed
                continue
            for item in parse_one_page(c, text):
                print(item)
                #write_to_sql(item)  # uncomment to store each row in MySQL
    
    if __name__ == '__main__':
        #create_sql()  # run once to create the database and table
        companys = ['阿里巴巴','京东','亚马逊','华为','贵州茅台']
        url = "https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd={}&medium=0"
        urls = [url.format(com) for com in companys]
        for c, url in enumerate(urls):  # enumerate() yields both the index and the item
            main(c, url)
            time.sleep(1)  # brief pause between companies
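
    Once write_to_sql() is enabled, the stored rows can be spot-checked with a
    short query script (a minimal sketch, assuming the same localhost
    credentials used above):

    import pymysql

    db = pymysql.connect(host='localhost', user='root', password='123456',
                         port=3306, db='spiders')
    cursor = db.cursor()
    cursor.execute('SELECT company, title, source, time FROM baidu LIMIT 5')
    for row in cursor.fetchall():
        print(row)
    db.close()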