  • Python crawler case collection (Python_爬虫_案例汇总):

    1. Douban book scraping

    #coding:utf-8
    # Scrape Douban book info and cover images, and write them into a database.

    from urllib import request
    # from bs4 import BeautifulSoup
    from lxml import etree
    import json,pymysql

    # from my_pymysql import pymysql

    url="https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4"
    headers={
        'Host':'book.douban.com',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
    }
    req = request.Request(url=url,headers=headers,method="GET")
    content = request.urlopen(req).read().decode("utf-8")
    content_dict=etree.HTML(content)    # parse the HTML
    # print(content_dict)
    content_dict_allli = content_dict.xpath(r'//*[@id="subject_list"]/ul/li')  # list of <li> nodes
    info_all = ''

    for li in content_dict_allli:
        # book title; note the XPath is relative to the <li> node, so it cannot be copied straight from the browser (this differs from bs4)
        title_list = li.xpath(r'div[2]/h2/a/@title')
        title = title_list[0]
        title = title.replace(" ",'')
        print(title)
        # info: author, publisher
        info_list = li.xpath(r'div[2]/div[1]/text()')
        author = info_list[0].split('/')[0]
        author = author.replace('\n','').replace(" ",'')
        chubanshe = info_list[0].split('/')[1]
        print(author)
        print(chubanshe)
        # rating
        pingfen_list = li.xpath(r'div[2]/div[2]/span[2]/text()')
        pingfen = pingfen_list[0]
        print(pingfen)

        # cover image
        img_net_addr = li.xpath(r'div[1]/a/img/@src')
        img_net_addr = img_net_addr[0]
        print(img_net_addr)
        data = request.urlopen(img_net_addr).read()
        img_name = 'douban/' + title + '.jpg'
        with open(img_name,'wb') as f:
            f.write(data)

        # database
        db = pymysql.connect(host='localhost',port=3306,user="root",password='root',db='douban',charset='utf8')
        cur = db.cursor()
        sql = "insert into douban(title,author,chubanshe,pingfen)values('%s','%s','%s','%s')"%(title,author,chubanshe,pingfen)
        cur.execute(sql)
        db.commit()

    db.close()
    Scrape Douban book info and cover images: custom request headers, save the data to a database, and download the images.
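
    The insert above reopens a connection and interpolates the values straight into the SQL string on every iteration, so a title containing a quote character would break the statement. A minimal sketch of the same write path with one connection and a parameterized query (assuming the same local `douban` database and table as above):

    import pymysql

    # one connection for the whole run, reused by every insert
    db = pymysql.connect(host='localhost', port=3306, user='root', password='root',
                         db='douban', charset='utf8')
    cur = db.cursor()

    def save_book(title, author, chubanshe, pingfen):
        # pymysql escapes the values itself, so quotes inside titles are safe
        sql = "insert into douban(title,author,chubanshe,pingfen) values (%s,%s,%s,%s)"
        cur.execute(sql, (title, author, chubanshe, pingfen))
        db.commit()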

    2. Lianjia

    #coding:utf-8
    # Done: scrape the Lianjia listings and save them to a file / database
    from urllib import request,error
    from bs4 import BeautifulSoup
    import pymysql

    # from my_pymysql import pymysql  # import the database module
    # connect to the database
    db = pymysql.connect(host='localhost',user='root',password='root',db='lianjia',charset='utf8')
    cur = db.cursor()   # create a cursor

    for i in range(1,33):
        req=request.urlopen('https://xa.lianjia.com/ershoufang/pg'+str(i)).read().decode('utf-8')
        req_bs4 = BeautifulSoup(req,'html.parser')  # build a BeautifulSoup object before using bs4
        body_ul=req_bs4.find('ul',class_="sellListContent")
        try:
            s=''
            for li in body_ul.find_all('li'):  # iterate the listing <li> tags (skips bare text nodes)
                # info_all = li.find('div',class_="info clear").get_text()    # all the info at once
                tit = li.find('div',class_="title").get_text()  # title
                addr = li.find('div',class_="houseInfo").get_text() # address
                pric = li.find('div',class_="totalPrice").get_text()    # price
                s+=tit
                s+=addr
                s+=pric
                s+='\n\n'
                print(i)    # show which page is being scraped
                # image scraping starts ++++++++++++++++++++++++++++++++++++++++++++
                img = li.find("img", class_='lj-lazy')['data-original'] # image URL
                img_format = img.split('.')[-1]  # the file extension after the last dot
                img_name = 'lianjia/images/' + li.find("img", class_='lj-lazy')['alt'] + '.' + img_format  # file name
                adr = request.urlopen(img).read()  # fetch the image bytes and write them out
                try:    # skip if empty
                    with open(img_name, 'wb')as f:
                        f.write(adr)
                except:
                    pass
                # image scraping done ----------------------------
                # save to the database
                sql = "insert into lianjia_hotel(title,address) values ('%s','%s')"%(tit,addr)
                cur.execute(sql)
                db.commit()
        except:
            print("本页完毕~")
    # close the database at the very end
    db.close()

    # write everything to a txt file
    # with open('lianjia/lianjia.txt','w',encoding="utf-8")as f:
    #     f.write(s)
    Lianjia download, text and images, parsed with bs4
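
    The commented-out file write at the bottom only ever sees the last page, because `s` is rebuilt inside the page loop; a small sketch (same file name as the comment) that appends each page to the file as it is scraped:

    def append_page(text, path='lianjia/lianjia.txt'):
        # 'a' mode keeps earlier pages instead of overwriting them
        with open(path, 'a', encoding='utf-8') as f:
            f.write(text)

    # call it once per page, right after the inner loop:  append_page(s)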

    3. Toutiao (Jinri Toutiao)

    from selenium import webdriver
    from lxml import etree
    from pyquery import PyQuery as pq
    import time
    
    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get('https://www.toutiao.com/')
    driver.implicitly_wait(10)
    driver.find_element_by_link_text('科技').click()
    driver.implicitly_wait(10)
    for x in range(3):
        js="var q=document.documentElement.scrollTop="+str(x*500)
        driver.execute_script(js)
        time.sleep(2)
    
    time.sleep(5)
    page = driver.page_source
    doc = pq(page)
    doc = etree.HTML(str(doc))
    contents = doc.xpath('//div[@class="wcommonFeed"]/ul/li')
    print(contents)
    for x in contents:
        title = x.xpath('div/div[1]/div/div[1]/a/text()')
        if title:
            title = title[0]
            with open('toutiao.txt','a+',encoding='utf8')as f:
                f.write(title+'\n')
            print(title)
        else:
            pass
    Toutiao: page scrolling driven by Selenium
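
    The `find_element_by_link_text` helper used above was removed in Selenium 4; a sketch of the same click-and-scroll flow with the current locator API (same link text and scroll step as the original):

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    import time

    driver = webdriver.Chrome()
    driver.maximize_window()
    driver.get('https://www.toutiao.com/')
    driver.implicitly_wait(10)
    driver.find_element(By.LINK_TEXT, '科技').click()   # Selenium 4 style locator
    for x in range(3):
        # scroll down in steps so lazily loaded feed items get rendered
        driver.execute_script('window.scrollTo(0, arguments[0]);', x * 500)
        time.sleep(2)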

    4. WeChat group info (including members) and contacts

    # -*- coding:utf-8 -*-
    '''
    After scanning the QR code to log in, fetch this WeChat account's groups (including group members) and address-book contacts. [Note: the results may be incomplete]
    '''
    
    import os
    import re
    import time
    import sys
    import subprocess
    import requests
    import xml.dom.minidom
    import json
    
    
    # WeChat login
    class WebwxLogin(object):
        def __init__(self):
            self.session = requests.session()
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:33.0) Gecko/20100101 Firefox/33.0'}
            self.QRImgPath = os.path.split(os.path.realpath(__file__))[0] + os.sep + 'webWeixinQr.jpg'
            self.uuid = ''
            self.tip = 0
            self.base_uri = ''
            self.redirect_uri = ''
            self.skey = ''
            self.wxsid = ''
            self.wxuin = ''
            self.pass_ticket = ''
            self.deviceId = 'e000000000000000'
            self.BaseRequest = {}
            self.ContactList = []
            self.My = []
            self.SyncKey = ''
    
        def getUUID(self):
    
            url = 'https://login.weixin.qq.com/jslogin'
            params = {
                'appid': 'wx782c26e4c19acffb',
                'redirect_uri': 'https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage',
                'fun': 'new',
                'lang': 'zh_CN',
                '_': int(time.time() * 1000),  # timestamp
            }
    
            response = self.session.get(url, params=params)
            target = response.content.decode('utf-8')
    
            pattern = r'window.QRLogin.code = (\d+); window.QRLogin.uuid = "(\S+?)"'
            ob = re.search(pattern, target)  # pull the status code and uuid out with a regex
    
            code = ob.group(1)
            self.uuid = ob.group(2)
    
            if code == '200':  # the request succeeded
                return True
    
            return False
    
        def showQRImage(self):
    
            url = 'https://login.weixin.qq.com/qrcode/' + self.uuid
            response = self.session.get(url)
    
            self.tip = 1
    
            with open(self.QRImgPath, 'wb') as f:
                f.write(response.content)
                f.close()
            # open the QR-code image with the platform's default viewer
            if sys.platform.find('darwin') >= 0:
                subprocess.call(['open', self.QRImgPath])  # macOS
            elif sys.platform.find('linux') >= 0:
                subprocess.call(['xdg-open', self.QRImgPath])  # Linux
            else:
                os.startfile(self.QRImgPath)  # Windows
    
            print('请使用微信扫描二维码登录')
    
        def checkLogin(self):
    
            url = 'https://login.weixin.qq.com/cgi-bin/mmwebwx-bin/login?tip=%s&uuid=%s&_=%s' % (
                self.tip, self.uuid, int(time.time() * 1000))
    
            response = self.session.get(url)
            target = response.content.decode('utf-8')
    
            pattern = r'window.code=(\d+);'
            ob = re.search(pattern, target)
            code = ob.group(1)
    
            if code == '201':  # QR code scanned
                print('成功扫描,请在手机上点击确认登录')
                self.tip = 0
            elif code == '200':  # logged in
                print('正在登录中...')
                regx = r'window.redirect_uri="(\S+?)";'
                ob = re.search(regx, target)
                self.redirect_uri = ob.group(1) + '&fun=new'
                self.base_uri = self.redirect_uri[:self.redirect_uri.rfind('/')]
            elif code == '408':  # timed out
                pass
    
            return code
    
        def login(self):
    
            response = self.session.get(self.redirect_uri, verify=False)
            data = response.content.decode('utf-8')
    
            doc = xml.dom.minidom.parseString(data)
            root = doc.documentElement
            # extract the login parameters from the XML response
            for node in root.childNodes:
                if node.nodeName == 'skey':
                    self.skey = node.childNodes[0].data
                elif node.nodeName == 'wxsid':
                    self.wxsid = node.childNodes[0].data
                elif node.nodeName == 'wxuin':
                    self.wxuin = node.childNodes[0].data
                elif node.nodeName == 'pass_ticket':
                    self.pass_ticket = node.childNodes[0].data
    
            if not all((self.skey, self.wxsid, self.wxuin, self.pass_ticket)):
                return False
    
            self.BaseRequest = {
                'Uin': int(self.wxuin),
                'Sid': self.wxsid,
                'Skey': self.skey,
                'DeviceID': self.deviceId,
            }
    
            return True
    
        def webwxinit(self):
    
            url = self.base_uri + \
                  '/webwxinit?pass_ticket=%s&skey=%s&r=%s' % (
                      self.pass_ticket, self.skey, int(time.time() * 1000))
            params = {
                'BaseRequest': self.BaseRequest
            }
    
            h = self.headers
            h['Content-Type'] = 'application/json; charset=UTF-8'
            response = self.session.post(url, data=json.dumps(params), headers=h, verify=False)
            data = response.content.decode('utf-8')
            print(data)
    
            dic = json.loads(data)
            self.ContactList = dic['ContactList']
            self.My = dic['User']
    
            SyncKeyList = []
            for item in dic['SyncKey']['List']:
                SyncKeyList.append('%s_%s' % (item['Key'], item['Val']))
            self.SyncKey = '|'.join(SyncKeyList)
    
            ErrMsg = dic['BaseResponse']['ErrMsg']
    
            Ret = dic['BaseResponse']['Ret']
            if Ret != 0:
                return False
    
            return True
    
        def webwxgetcontact(self):
    
            url = self.base_uri + \
                  '/webwxgetcontact?pass_ticket=%s&skey=%s&r=%s' % (
                      self.pass_ticket, self.skey, int(time.time()))
    
            h = self.headers
            h['Content-Type'] = 'application/json; charset=UTF-8'
            response = self.session.get(url, headers=h, verify=False)
            data = response.content.decode('utf-8')
            # print(data)
    
            dic = json.loads(data)
            MemberList = dic['MemberList']
    
            # iterate in reverse below, otherwise removing items breaks the indexing
            SpecialUsers = ["newsapp", "fmessage", "filehelper", "weibo", "qqmail", "tmessage", "qmessage", "qqsync",
                            "floatbottle", "lbsapp", "shakeapp", "medianote", "qqfriend", "readerapp", "blogapp",
                            "facebookapp", "masssendapp",
                            "meishiapp", "feedsapp", "voip", "blogappweixin", "weixin", "brandsessionholder",
                            "weixinreminder", "wxid_novlwrv3lqwv11", "gh_22b87fa7cb3c", "officialaccounts",
                            "notification_messages", "wxitil", "userexperience_alarm"]
            for i in range(len(MemberList) - 1, -1, -1):
                Member = MemberList[i]
                if Member['VerifyFlag'] & 8 != 0:  # official/service accounts
                    MemberList.remove(Member)
                elif Member['UserName'] in SpecialUsers:  # special built-in accounts
                    MemberList.remove(Member)
                elif Member['UserName'].find('@@') != -1:  # group chats
                    MemberList.remove(Member)
                elif Member['UserName'] == self.My['UserName']:  # the logged-in account itself
                    MemberList.remove(Member)
    
            return MemberList
    
        def main(self):
            if not self.getUUID():
                print('获取uuid失败')
                return
    
            self.showQRImage()
            time.sleep(1)
    
            while self.checkLogin() != '200':
                pass
    
            os.remove(self.QRImgPath)
    
            if not self.login():
                print('登录失败')
                return
            # login finished; now fetch the contacts
            if not self.webwxinit():
                print('初始化失败')
                return
    
            MemberList = self.webwxgetcontact()
    
            print('通讯录共%s位好友' % len(MemberList))
    
            for x in MemberList:
                sex = '未知' if x['Sex'] == 0 else '男' if x['Sex'] == 1 else '女'
                print('昵称:%s, 性别:%s, 备注:%s, 签名:%s' % (x['NickName'], sex, x['RemarkName'], x['Signature']))
    
    if __name__ == '__main__':
        print('开始')
        wx = WebwxLogin()
        wx.main()
    Scrape WeChat group info (including members) and contact info
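
    The login polling above hinges on two regular expressions pulled from the response body; a tiny sketch of what they extract, run against a hypothetical response in the same window.code / window.redirect_uri format:

    import re

    # hypothetical response text, mirroring the format the regexes above expect
    sample = 'window.code=200;window.redirect_uri="https://wx.qq.com/cgi-bin/mmwebwx-bin/webwxnewloginpage?ticket=abc";'
    code = re.search(r'window.code=(\d+);', sample).group(1)            # '200' means the login is confirmed
    uri = re.search(r'window.redirect_uri="(\S+?)";', sample).group(1)  # becomes redirect_uri (plus '&fun=new')
    print(code, uri)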

    5. Scrape Taobao listings for a fixed category + save to a MySQL database (well-structured code)

    import requests
    import re
    import pymysql
    
    
    def getHTMLtext(url):
        try:
           r=requests.get(url,timeout=100)
           r.raise_for_status()
           r.encoding=r.apparent_encoding
           return r.text
        except:
            return ""
    def getpage(itl,html):
        try:
            plt=re.findall(r'"view_price":"[\d.]*"',html)
            nlt=re.findall(r'"raw_title":".*?"',html)
            for i in range(len(plt)):
                price = eval(plt[i].split(':')[1])  # eval(fun,obj)
                title = eval(nlt[i].split(':')[1])
                itl.append([price, title])
        except:
           print("")
    
    
    def printgoods(itl):
        tplt = "{:2}\t{:8}\t{:16}"
        print(tplt.format("序号", "价格", "商品名称"))
    
        count = 0
        conn = pymysql.connect(host='127.0.0.1', user='root', password='123456', db='company',charset="utf8")
    
        cur = conn.cursor()
    
        sqlc = '''
                    create table coffee(
                    id int(11) not null auto_increment primary key,
                    name varchar(255) not null,
                    price float not null)DEFAULT CHARSET=utf8;
                    '''
    
        try:
            A = cur.execute(sqlc)
            conn.commit()
            print('成功')
        except:
            print("错误")
        for g in itl:
            count = count + 1
            b=tplt.format(count, g[0], g[1])
    
    
    
            sqla = '''
            insert into  coffee(name,price)
            values(%s,%s);
           '''
            try:
                B = cur.execute(sqla,(g[1],g[0]))
                conn.commit()
                print('成功')
            except:
                print("错误")
    
            # save_path = 'D:/taobao.txt'
            # f=open(save_path,'a')
            #
            # f.write(b+'\n')
            # f.close()
    
        conn.commit()
        cur.close()
        conn.close()
    
    
    def main():
        goods="咖啡"
        depth =2
        start_url='https://s.taobao.com/search?q='+goods
        List =[]
        for i in range(depth):
            try:
                url =start_url +"&s="+ str(i*44)
                html=getHTMLtext(url)
                getpage(List,html)
            except:
               continue
    
    
        printgoods(List)
        # savefiles(data)
    
    
    
    
    main()
    Taobao data collection + save to a MySQL database
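
    The two regexes in `getpage()` pick prices and titles out of the JSON embedded in the search page; a sketch of what they return on a hypothetical fragment, using `ast.literal_eval` instead of `eval` to strip the surrounding quotes:

    import re, ast

    # hypothetical fragment in the same "key":"value" shape the regexes above expect
    html = '"raw_title":"蓝山咖啡豆","view_price":"59.00","raw_title":"挂耳咖啡","view_price":"29.90"'
    prices = [ast.literal_eval(m.split(':')[1]) for m in re.findall(r'"view_price":"[\d.]*"', html)]
    titles = [ast.literal_eval(m.split(':')[1]) for m in re.findall(r'"raw_title":".*?"', html)]
    print(list(zip(titles, prices)))   # [('蓝山咖啡豆', '59.00'), ('挂耳咖啡', '29.90')]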

    6. Ajax requests, model code

    import requests
    import logging
    import json
    from os import makedirs
    from os.path import exists
    
    # total number of pages to crawl
    TOTAL_PAGE = 2
    # directory for the result files
    RESULTS_DIR = 'results'
    exists(RESULTS_DIR) or makedirs(RESULTS_DIR)
    # logging configuration
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
    # listing (index) API URL
    INDEX_URL = 'https://dynamic1.scrape.center/api/movie/?limit={limit}&offset={offset}'
    # items per page
    LIMIT = 2
    # detail API URL
    DETAIL_URL = 'https://dynamic1.scrape.center/api/movie/{id}'
    
    
    def scrape_api(url):
        logging.info('scraping %s...', url)
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.json()
            logging.error('get invalid status code %s while scraping %s', response.status_code, url)
        except requests.RequestException:
            logging.error('error occurred while scraping %s', url, exc_info=True)
    
    
    def scrape_index(page):
        url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
        return scrape_api(url)
    
    
    def scrape_detail(id):
        url = DETAIL_URL.format(id=id)
        return scrape_api(url)
    
    
    def save_data(data):
        name = data.get('name')
        data_path = f'{RESULTS_DIR}/{name}.json'
        json.dump(data, open(data_path, 'w', encoding='utf-8'), ensure_ascii=False, indent=2)
    
    
    def main():
        for page in range(1, TOTAL_PAGE + 1):
            index_data = scrape_index(page)
            for item in index_data.get('results'):
                id = item.get('id')
                detail_data = scrape_detail(id)
                logging.info('detail data %s', detail_data)
                save_data(detail_data)
    
    
    if __name__ == '__main__':
        main()
    Ajax requests, model code
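
    One thing the model code leaves out: `requests.get` with no timeout can hang indefinitely on a stalled connection. A sketch of the same `scrape_api` with a timeout and a couple of retries (same endpoints assumed):

    import logging
    import requests

    def scrape_api(url, retries=2, timeout=10):
        for attempt in range(retries + 1):
            try:
                response = requests.get(url, timeout=timeout)
                if response.status_code == 200:
                    return response.json()
                logging.error('got status code %s while scraping %s', response.status_code, url)
            except requests.RequestException:
                logging.error('attempt %s failed while scraping %s', attempt + 1, url, exc_info=True)
        return None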
  • Original article: https://www.cnblogs.com/hellangels333/p/8621368.html