zoukankan      html  css  js  c++  java
  • Go读取论文并转换为simhash

    package main
    
    import (
    	"database/sql"
    	_ "flag"
    	"fmt"
    	_ "io/ioutil"
    	"log"
    	"os"
    	_ "path"
    	_ "strings"
    	"time"

    	_ "baliance.com/gooxml/document"
    	_ "github.com/go-sql-driver/mysql"
    	"github.com/yanyiwu/gosimhash"
    )
    
    
    func main(){
    
    
    
    	t1 := time.Now()
    
    	Mylog(doc)
    	if err != nil {
    		Mylog(err)
    	}
    
        db, err := sql.Open("mysql", "root:123456@tcp(127.0.0.1:3306)/gzpg_crs_jsj?charset=utf8");
        if err != nil {
            fmt.Println(err);
        }
    	sql :="select s1.paper_id,s2.title_cn,s2.abstract_cn,s2.keyword_cn,s2.title_en,s2.abstract_en,s2.keyword_en,s1.s_content from sf_content s1,sf_paper s2 where  s1.paper_id=s2.paper_id limit 10"
    	rows, err := db.Query(sql)
        if err != nil {
    		fmt.Println(err);
    	}
    	stmt, err := db.Prepare("INSERT  sim_path SET paperid=?,simcode=?")
    	if err != nil {
    		fmt.Println(err);
    	}
    
    	var str string
    	var code string
    	//查询多个
        for rows.Next() {
    		var paper_id int //论文id
    		var title_cn string //中文题目
    		var abstract_cn string //中文摘要
    		var keyword_cn string //中文关键词
    		var title_en string //英文题目
    		var abstract_en string //英文摘要
    		var keyword_en string //英文关键词
    		var s_content string//全文内容
    		
            err = rows.Scan(&paper_id, &title_cn,&abstract_cn,&keyword_cn,&title_en,&abstract_en,&keyword_en,&s_content)
    		str = fmt.Sprintf("%s
     摘要:%s
     关键词:%s
     %s
     Abstract:%s
     Keywords:%s
     %s
    ",title_cn,abstract_cn,keyword_cn,title_en,abstract_en,keyword_en,s_content)
    		code=simhash(str)
    		res, err := stmt.Exec(paper_id, code)
    		if err != nil {
    			fmt.Println(err);
    		}
    		id, err := res.LastInsertId()
    		if err != nil {
    			fmt.Println(err);
    		}
    		fmt.Print("%s成功%s 
    ",id,paper_id);
    	
    	}
    	db.Close()
    	elapsed := time.Since(t1)
    	log.Println("时间花费位:
    " , elapsed)
    
    }
    
    // simhash computes the simhash fingerprint of str with gosimhash and returns
    // it as a 64-character string of '0' and '1', highest bit first.
    //
    // A hasher is created from the dictionary files under ../dict on every call
    // and freed before returning. The second argument to MakeSimhash is 1
    // (presumably the number of top keywords used — confirm against gosimhash).
    func simhash(str string) string {
    	h := gosimhash.New("../dict/jieba.dict.utf8", "../dict/hmm_model.utf8", "../dict/idf.utf8", "../dict/stop_words.utf8")
    	defer h.Free()

    	fp := h.MakeSimhash(str, 1)

    	digits := make([]byte, 64)
    	for pos := range digits {
    		// Position 0 holds bit 63, position 63 holds bit 0.
    		if (fp>>uint(63-pos))&1 == 1 {
    			digits[pos] = '1'
    		} else {
    			digits[pos] = '0'
    		}
    	}
    	return string(digits)
    }
    
    
    // Mylog appends the given values as one line to the file "20181105go.log",
    // creating the file if needed. Each line carries the TAG prefix followed by
    // date, time and microseconds.
    //
    // If the log file cannot be opened, the open error and the values are written
    // through the standard logger instead, so the message is not lost.
    //
    // NOTE(review): TAG is not declared in the visible part of this file —
    // presumably a package-level string defined elsewhere; confirm.
    func Mylog(v ...interface{}) {
    	f, err := os.OpenFile("20181105go.log", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
    	if err != nil {
    		// Calling Mylog here would retry the same failing open and recurse
    		// without bound, so report through the standard logger and stop.
    		log.Println(err)
    		log.Println(v...)
    		return
    	}
    	defer f.Close()

    	logger := log.New(f, TAG, log.Ldate|log.Ltime|log.Lmicroseconds)
    	logger.Println(v...)
    }
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
  • 相关阅读:
    Educational Codeforces Round 86 (Rated for Div. 2) D. Multiple Testcases
    Educational Codeforces Round 86 (Rated for Div. 2) C. Yet Another Counting Problem
    HDU
    HDU
    HDU
    HDU
    Good Bye 2019 C. Make Good (异或的使用)
    Educational Codeforces Round 78 (Rated for Div. 2) C. Berry Jam
    codeforces 909C. Python Indentation
    codeforces1054 C. Candies Distribution
  • 原文地址:https://www.cnblogs.com/mengluo/p/9915440.html
Copyright © 2011-2022 走看看