zoukankan      html  css  js  c++  java
  • Go Pentester

    Parsing Document Metadata with Bing Scraping

    Set up the environment - install goquery package.

    https://github.com/PuerkitoBio/goquery

    go get github.com/PuerkitoBio/goquery

     Modify the Proxy setting if in China. Refer to: https://sum.golang.org/

     Unzip an Office file and analyze the Open XML file structure. "creator", "lastModifiedBy" in core.xml and "Application", "Company", "AppVersion" in app.xml are of primary interest.

     Define the metadata package and map the data to structs in Go to open, parse, and extract metadata from Office Open XML documents.

    package metadata
    
    import (
    	"archive/zip"
    	"encoding/xml"
    	"strings"
    )
    
    // Open XML type definition and version mapping
    //
    // OfficeCoreProperty receives the fields decoded from an XML element named
    // coreProperties. NewProperties fills it from the archive member
    // docProps/core.xml.
    type OfficeCoreProperty struct {
    	// XMLName pins the expected root element name.
    	XMLName        xml.Name `xml:"coreProperties"`
    	// Creator is the text of the creator child element.
    	Creator        string   `xml:"creator"`
    	// LastModifiedBy is the text of the lastModifiedBy child element.
    	LastModifiedBy string   `xml:"lastModifiedBy"`
    }
    
    // OfficeAppProperty receives the fields decoded from an XML element named
    // Properties. NewProperties fills it from the archive member
    // docProps/app.xml.
    type OfficeAppProperty struct {
    	// XMLName pins the expected root element name.
    	XMLName     xml.Name `xml:"Properties"`
    	// Application is the text of the Application child element.
    	Application string   `xml:"Application"`
    	// Company is the text of the Company child element.
    	Company     string   `xml:"Company"`
    	// Version is the text of the AppVersion child element; GetMajorVersion
    	// interprets the part before the first "." as the major version.
    	Version     string   `xml:"AppVersion"`
    }
    
    // OfficeVersion maps the major component of an AppVersion string (the text
    // before the first ".") to a release label such as "2016".
    // NOTE(review): the labels look like Microsoft Office release years; majors
    // not listed here (for example "13") are reported as "Unknown" by
    // GetMajorVersion.
    var OfficeVersion = map[string]string{
    	"16": "2016",
    	"15": "2013",
    	"14": "2010",
    	"12": "2007",
    	"11": "2003",
    }
    
    // GetMajorVersion translates the Version field into the release label
    // recorded in OfficeVersion.
    //
    // The text before the first "." is used as the lookup key. It returns
    // "Unknown" when Version contains no "." at all or when the key has no entry
    // in OfficeVersion.
    func (a *OfficeAppProperty) GetMajorVersion() string {
    	const unknown = "Unknown"
    
    	// A version without a separator is treated as malformed.
    	dot := strings.Index(a.Version, ".")
    	if dot < 0 {
    		return unknown
    	}
    
    	if release, found := OfficeVersion[a.Version[:dot]]; found {
    		return release
    	}
    	return unknown
    }
    
    // NewProperties scans the members of the zip archive r and decodes the two
    // Open XML property documents it knows about: docProps/core.xml into an
    // OfficeCoreProperty and docProps/app.xml into an OfficeAppProperty.
    //
    // Every other archive member is ignored. A member that is absent leaves the
    // corresponding struct at its zero value. If decoding a member fails, both
    // results are nil and the error is returned.
    func NewProperties(r *zip.Reader) (*OfficeCoreProperty, *OfficeAppProperty, error) {
    	core := new(OfficeCoreProperty)
    	app := new(OfficeAppProperty)
    
    	// Member name -> struct that receives its decoded content.
    	destinations := map[string]interface{}{
    		"docProps/core.xml": core,
    		"docProps/app.xml":  app,
    	}
    
    	for _, member := range r.File {
    		dst, wanted := destinations[member.Name]
    		if !wanted {
    			continue
    		}
    		if err := process(member, dst); err != nil {
    			return nil, nil, err
    		}
    	}
    	return core, app, nil
    }
    
    // process opens the archive member f and decodes its XML content into prop.
    //
    // prop must be a non-nil pointer to the value that should receive the decoded
    // data. The returned error is whatever opening the member or decoding the XML
    // reported, or nil on success.
    func process(f *zip.File, prop interface{}) error {
    	rc, err := f.Open()
    	if err != nil {
    		return err
    	}
    	defer rc.Close()
    
    	// prop already carries the destination pointer, so hand it to the decoder
    	// as-is instead of wrapping it in a pointer-to-interface.
    	return xml.NewDecoder(rc).Decode(prop)
    }
    

    Figure out how to search for and retrieve files by using Bing.

    1. Submit a search request to Bing with proper filters to retrieve targeted results.

    2. Scrape the HTML response, extracting the HREF (link) data to obtain direct URLs for documents.

    3. Submit an HTTP request for each direct document URL.

    4. Parse the response body to create a zip.Reader.

    5. Pass the zip.Reader into the code you already developed to extract metadata.

    Analyze the search result elements in Bing.

     Now scrape Bing results and parse the document metadata.

    package metadata
    
    import (
    	"archive/zip"
    	"encoding/xml"
    	"strings"
    )
    
    // Open XML type definition and version mapping
    //
    // OfficeCoreProperty receives the fields decoded from an XML element named
    // coreProperties. NewProperties fills it from the archive member
    // docProps/core.xml.
    type OfficeCoreProperty struct {
    	// XMLName pins the expected root element name.
    	XMLName        xml.Name `xml:"coreProperties"`
    	// Creator is the text of the creator child element.
    	Creator        string   `xml:"creator"`
    	// LastModifiedBy is the text of the lastModifiedBy child element.
    	LastModifiedBy string   `xml:"lastModifiedBy"`
    }
    
    // OfficeAppProperty receives the fields decoded from an XML element named
    // Properties. NewProperties fills it from the archive member
    // docProps/app.xml.
    type OfficeAppProperty struct {
    	// XMLName pins the expected root element name.
    	XMLName     xml.Name `xml:"Properties"`
    	// Application is the text of the Application child element.
    	Application string   `xml:"Application"`
    	// Company is the text of the Company child element.
    	Company     string   `xml:"Company"`
    	// Version is the text of the AppVersion child element; GetMajorVersion
    	// interprets the part before the first "." as the major version.
    	Version     string   `xml:"AppVersion"`
    }
    
    // OfficeVersion maps the major component of an AppVersion string (the text
    // before the first ".") to a release label such as "2016".
    // NOTE(review): the labels look like Microsoft Office release years; majors
    // not listed here (for example "13") are reported as "Unknown" by
    // GetMajorVersion.
    var OfficeVersion = map[string]string{
    	"16": "2016",
    	"15": "2013",
    	"14": "2010",
    	"12": "2007",
    	"11": "2003",
    }
    
    // GetMajorVersion translates the Version field into the release label
    // recorded in OfficeVersion.
    //
    // The text before the first "." is used as the lookup key. It returns
    // "Unknown" when Version contains no "." at all or when the key has no entry
    // in OfficeVersion.
    func (a *OfficeAppProperty) GetMajorVersion() string {
    	const unknown = "Unknown"
    
    	// A version without a separator is treated as malformed.
    	dot := strings.Index(a.Version, ".")
    	if dot < 0 {
    		return unknown
    	}
    
    	if release, found := OfficeVersion[a.Version[:dot]]; found {
    		return release
    	}
    	return unknown
    }
    
    // NewProperties scans the members of the zip archive r and decodes the two
    // Open XML property documents it knows about: docProps/core.xml into an
    // OfficeCoreProperty and docProps/app.xml into an OfficeAppProperty.
    //
    // Every other archive member is ignored. A member that is absent leaves the
    // corresponding struct at its zero value. If decoding a member fails, both
    // results are nil and the error is returned.
    func NewProperties(r *zip.Reader) (*OfficeCoreProperty, *OfficeAppProperty, error) {
    	core := new(OfficeCoreProperty)
    	app := new(OfficeAppProperty)
    
    	// Member name -> struct that receives its decoded content.
    	destinations := map[string]interface{}{
    		"docProps/core.xml": core,
    		"docProps/app.xml":  app,
    	}
    
    	for _, member := range r.File {
    		dst, wanted := destinations[member.Name]
    		if !wanted {
    			continue
    		}
    		if err := process(member, dst); err != nil {
    			return nil, nil, err
    		}
    	}
    	return core, app, nil
    }
    
    // process opens the archive member f and decodes its XML content into prop.
    //
    // prop must be a non-nil pointer to the value that should receive the decoded
    // data. The returned error is whatever opening the member or decoding the XML
    // reported, or nil on success.
    func process(f *zip.File, prop interface{}) error {
    	rc, err := f.Open()
    	if err != nil {
    		return err
    	}
    	defer rc.Close()
    
    	// prop already carries the destination pointer, so hand it to the decoder
    	// as-is instead of wrapping it in a pointer-to-interface.
    	return xml.NewDecoder(rc).Decode(prop)
    }
    

    相信未来 - 该面对的绝不逃避,该执著的永不怨悔,该舍弃的不再留念,该珍惜的好好把握。
  • 相关阅读:
    如何在原生微信小程序中实现数据双向绑定
    【推荐】开源项目minapp-重新定义微信小程序的开发
    iKcamp|基于Koa2搭建Node.js实战(含视频)☞ 规范与部署
    iKcamp|基于Koa2搭建Node.js实战(含视频)☞ 错误处理
    系列3|走进Node.js之多进程模型
    手把手教你撸一个 Webpack Loader
    iKcamp|基于Koa2搭建Node.js实战(含视频)☞ 记录日志
    React Native 网络层分析
    如何实现VM框架中的数据绑定
    iKcamp|基于Koa2搭建Node.js实战(含视频)☞ 解析JSON
  • 原文地址:https://www.cnblogs.com/keepmoving1113/p/12391846.html
Copyright © 2011-2022 走看看