zoukankan      html  css  js  c++  java
  • Datawhale-新闻文本分类-task3-机器学习分类

    import pandas as pd
    # The data files are tab-separated. Use the explicit '\t' escape instead of
    # a raw tab character inside the literal: a raw tab is invisible and is
    # easily turned into spaces by editors or copy-paste.
    train = pd.read_csv(r'./train_set.csv', sep='\t')
    test_a = pd.read_csv(r'./test_a.csv', sep='\t')
    
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import RidgeClassifier
    def do_nothing_tfidf(train_data, seed=42):
        """Baseline experiment: default TF-IDF features + RidgeClassifier.

        Vectorizes ``train_data['text']`` with a default ``TfidfVectorizer``,
        holds out 30% of the rows for validation, fits a ``RidgeClassifier``
        on the rest, and prints the training score, the macro-averaged F1 on
        the hold-out split, and the elapsed wall-clock time in whole seconds.

        Args:
            train_data: Table-like object indexable by ``'text'`` (documents)
                and ``'label'`` (targets), e.g. a pandas DataFrame.
            seed: ``random_state`` for the train/validation split. The
                default is an arbitrary fixed value so runs are repeatable.

        Returns:
            None. Results are printed.
        """
        import time
        from sklearn.feature_extraction.text import TfidfVectorizer
        # f1_score was used without ever being imported.
        from sklearn.metrics import f1_score

        start = time.time()
        tfidf = TfidfVectorizer()
        # Use the argument, not the module-level ``train`` global.
        # NOTE(review): the vectorizer is fitted on all rows before the split,
        # so IDF statistics include the validation documents.
        X = tfidf.fit_transform(train_data['text'])
        y = train_data['label']
        x_train, x_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.3, random_state=seed)
        clf = RidgeClassifier()
        clf.fit(x_train, y_train)
        print(clf.score(x_train, y_train))
        y_pre = clf.predict(x_valid)
        print(f1_score(y_valid, y_pre, average='macro'))
        print('spent 【%d】s'%(int(time.time() - start)))
    
    def set_ngram_tfidf(train_data, seed=42):
        """N-gram experiment: TF-IDF over 1- to 3-grams + RidgeClassifier.

        Vectorizes ``train_data['text']`` with
        ``TfidfVectorizer(ngram_range=(1, 3))``, holds out 30% of the rows for
        validation, fits a ``RidgeClassifier`` on the rest, and prints the
        training score, the macro-averaged F1 on the hold-out split, and the
        elapsed wall-clock time in whole seconds.

        Args:
            train_data: Table-like object indexable by ``'text'`` (documents)
                and ``'label'`` (targets), e.g. a pandas DataFrame.
            seed: ``random_state`` for the train/validation split. The
                default is an arbitrary fixed value so runs are repeatable.

        Returns:
            None. Results are printed.
        """
        import time
        from sklearn.feature_extraction.text import TfidfVectorizer
        # f1_score was used without ever being imported.
        from sklearn.metrics import f1_score

        start = time.time()
        tfidf = TfidfVectorizer(ngram_range=(1, 3))
        # Use the argument, not the module-level ``train`` global.
        # NOTE(review): the vectorizer is fitted on all rows before the split,
        # so IDF statistics include the validation documents.
        X = tfidf.fit_transform(train_data['text'])
        y = train_data['label']
        x_train, x_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.3, random_state=seed)
        clf = RidgeClassifier()
        clf.fit(x_train, y_train)
        print(clf.score(x_train, y_train))
        y_pre = clf.predict(x_valid)
        print(f1_score(y_valid, y_pre, average='macro'))
        print('spent 【%d】s'%(int(time.time() - start)))
    
    def set_ngram_max_feature_tfidf(train_data, seed=42):
        """Capped n-gram experiment: 1- to 3-gram TF-IDF limited to 5000 features.

        Vectorizes ``train_data['text']`` with
        ``TfidfVectorizer(ngram_range=(1, 3), max_features=5000)``, holds out
        30% of the rows for validation, fits a ``RidgeClassifier`` on the
        rest, and prints the training score, the macro-averaged F1 on the
        hold-out split, and the elapsed wall-clock time in whole seconds.

        Args:
            train_data: Table-like object indexable by ``'text'`` (documents)
                and ``'label'`` (targets), e.g. a pandas DataFrame.
            seed: ``random_state`` for the train/validation split. The
                default is an arbitrary fixed value so runs are repeatable.

        Returns:
            None. Results are printed.
        """
        import time
        from sklearn.feature_extraction.text import TfidfVectorizer
        # f1_score was used without ever being imported.
        from sklearn.metrics import f1_score

        start = time.time()
        tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
        # Use the argument, not the module-level ``train`` global.
        # NOTE(review): the vectorizer is fitted on all rows before the split,
        # so IDF statistics include the validation documents.
        X = tfidf.fit_transform(train_data['text'])
        y = train_data['label']
        x_train, x_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.3, random_state=seed)
        clf = RidgeClassifier()
        clf.fit(x_train, y_train)
        print(clf.score(x_train, y_train))
        y_pre = clf.predict(x_valid)
        print(f1_score(y_valid, y_pre, average='macro'))
        print('spent 【%d】s'%(int(time.time() - start)))
        
    # Run the three TF-IDF experiments in order, printing a banner before each.
    experiments = (
        ('=============do nothing tfidf……================', do_nothing_tfidf),
        ('=============just do ngram tfidf……=============', set_ngram_tfidf),
        ('=============both do ngram and max_feature……============', set_ngram_max_feature_tfidf),
    )
    for banner, run_experiment in experiments:
        print(banner)
        run_experiment(train)
    
    

    ![
    ]

  • 相关阅读:
    云原生时代,应用架构将如何演进?
    OpenKruise:解放 DaemonSet 运维之路
    端应用研发进入云原生时代
    如何通过 Serverless 技术降低微服务应用资源成本?
    让容器应用管理更快更安全,Dragonfly 发布 Nydus 容器镜像加速服务
    一文教会你如何写复杂业务代码
    gitalk未找到相关的 Issues 进行评论解决方案
    dpkg error processing package XXX (--configure) 解决方法 (ubuntu右上角红色警告)
    Linux添加快捷方式
    linux开机部署
  • 原文地址:https://www.cnblogs.com/Alexisbusyblog/p/13378863.html
Copyright © 2011-2022 走看看