1. 基于幽灵蛛(Pholcus)开源项目的采集规则
下面直接贴出代码;代码稍作修改后即可用于其他爬虫项目。
package pholcus_lib
// 基础包
import (
	"crypto/rand"
	"crypto/rsa"
	"encoding/base64"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"math/big"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"

	"github.com/henrylee2cn/pholcus/app/downloader/request" // required
	. "github.com/henrylee2cn/pholcus/app/spider"           // required
	"github.com/henrylee2cn/pholcus/common/goquery"         // DOM parsing

	// Optional packages, currently disabled:
	// . "github.com/henrylee2cn/pholcus/app/spider/common"
	// "github.com/henrylee2cn/pholcus/logs"
	// "encoding/xml"
	// "math"
	// "net/http" // for setting http.Header
)
// Package-level state shared by init and the spider rules.
var (
	// millisecond holds the timestamp captured once in init; Root appends it
	// as the "_" query parameter of the prelogin URL.
	millisecond int64

	// name is the account user name. init overwrites it with the value
	// returned by encryptUname, and that encoded form is what the rules send.
	name string

	// password is the plain-text account password passed to encryptPassword.
	password string
)
// Info is the decoding target for the JSON body of the login response
// (see rule "登录二"). The fields carry no tags, so encoding/json matches
// them to object keys by name, ignoring case.
type Info struct {
	// Retcode is decoded from the response; the visible rules only print it
	// indirectly through the raw response text.
	Retcode int
	// Uid is decoded from the response and not otherwise read here.
	Uid string
	// Nick is decoded from the response and not otherwise read here.
	Nick string
	// CrossDomainUrlList holds the follow-up URLs; rule "登录二" requests
	// the entry at index 2.
	CrossDomainUrlList []string
}
func init() {
FileTest.Register()
millisecond = getMillisecond()
name="88888888"
password="8888888"
name = encryptUname(name)
}
var FileTest = &Spider{
Name: "微博登录测试",
Description: "测试 [s.weibo.com/user/]",
Pausetime: 1500,
Keyin: KEYIN,
// Limit: LIMIT,
EnableCookie: true,
RuleTree: &RuleTree{
Root: func(ctx *Context) {
//https://weibo.cn/
ctx.AddQueue(&request.Request{
Url: "https://login.sina.com.cn/sso/prelogin.php?entry=account&callback=sinaSSOController.preloginCallBack&su="+name+"&rsakt=mod&client=ssologin.js(v1.4.15)&_="+strconv.FormatInt(millisecond,10),
Rule: "登录一",
//DownloaderID:1,
})
},
Trunk: map[string]*Rule{
"登录一": {
ParseFunc: func(ctx *Context) {
str := ctx.GetText()
println("-----1-----" + str)
compile1, _ := regexp.Compile("{.*}")
match1 := compile1.FindString(str)
fmt.Println(match1)
//json str 转map
var dat map[string]interface{}
if err := json.Unmarshal([]byte(match1), &dat); err == nil {
if err != nil{
println("转换异常!")
}
}
servertime := dat["servertime"]
servertime= strconv.FormatFloat(servertime.(float64), 'f', -1, 64)
nonce:=dat["nonce"]
pubkey:=dat["pubkey"]
rsakv := dat["rsakv"]
//加密密码
ep := encryptPassword(pubkey.(string), servertime.(string), nonce.(string), password)
postDict := map[string]string{}
postDict["entry"] = "account"
postDict["gateway"] = "1"
postDict["from"] = ""
postDict["savestate"] = "30"
postDict["qrcode_flag"] = "true"
postDict["useticket"] = "0"
postDict["pagerefer"] = ""
postDict["vsnf"] = "1"
postDict["su"] = name
postDict["service"] = "account"
postDict["servertime"] = servertime.(string)
postDict["nonce"] = nonce.(string)
postDict["pwencode"] = "rsa2"
postDict["rsakv"] = rsakv.(string)
postDict["sp"] = ep
postDict["sr"] = "1395*822"
postDict["cdult"] = "3"
postDict["domain"] = "sina.com.cn"
postDict["prelt"] = "170"
postDict["returntype"] = "TEXT"
postValues := url.Values{}
for postKey, PostValue := range postDict{
postValues.Set(postKey, PostValue)
}
//post参数编码
postDataStr := postValues.Encode()
ctx.AddQueue(&request.Request{
Url: "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)&_="+strconv.FormatInt(getMillisecond(),10),
Method: "POST",
EnableCookie: true,
PostData: postDataStr,
Rule: "登录二",
//DownloaderID:1,
})
},
},
"登录二": {
ParseFunc: func(ctx *Context) {
str := ctx.GetText()
println("-----2-----" + str)
var dat Info
json.Unmarshal([]byte(str), &dat)
//此处获取2个链接,包含普通版和移动版
//print(dat.CrossDomainUrlList[2])
ctx.AddQueue(&request.Request{
Url: dat.CrossDomainUrlList[2],
Method: "GET",
EnableCookie: true,
Rule: "登录三",
})
},
},
"登录三": {
ParseFunc: func(ctx *Context) {
ctx.AddQueue(&request.Request{
Url: "https://weibo.cn/",
Method: "GET",
EnableCookie: true,
Rule: "重定向一",
})
},
},
"重定向一": {
ParseFunc: func(ctx *Context) {
compile2, _ := regexp.Compile("[a-zA-z]+://[^\s]*")
string := compile2.FindAllString(ctx.GetText(), 2)
ctx.AddQueue(&request.Request{
Url: string[1],
Method: "GET",
EnableCookie: true,
Rule: "重定向二",
})
},
},
"重定向二": {
ParseFunc: func(ctx *Context) {
compile2, _ := regexp.Compile("[a-zA-z]+://[^\s]*")
string := compile2.FindAllString(ctx.GetText(), 3)
ctx.AddQueue(&request.Request{
Url: string[2],
Method: "GET",
EnableCookie: true,
Rule: "进入首页",
})
},
},
"进入首页": {
ParseFunc: func(ctx *Context) {
for z := 1;z<=2;z++{
ctx.AddQueue(&request.Request{
Url: "https://weibo.cn/search/user/?keyword="+ ctx.GetKeyin() + "&page=" + strconv.Itoa(z),// //,
Rule: "查找微博",
Method: "GET",
EnableCookie: true,
//PostData:"keyword=财经&suser=找人",
//DownloaderID:1,smblog
})
}
},
},
"查找微博": {
ParseFunc: func(ctx *Context) {
println("---------------查找微博-------------")
query := ctx.GetDom()
navBox := query.Find("table")
navBox.Each(func(i int, s *goquery.Selection) {
str := s.Find("tr").Text()
j := strings.LastIndex(str,"粉丝")
z := strings.LastIndex(str,"人")
//昵称
name := str[0:j]
//粉丝数
fansNum := str[j+6:z]
//地区
city := str[z+5:len(str)]
println("name" + name)
//链接
if url, ok := s.Find("table tr td a").Attr("href"); ok {
ctx.AddQueue(&request.Request{
Url: "https://weibo.cn" + url,
Rule: "博主首页",
Temp: map[string]interface{}{
"name": name,
"fansNum": fansNum,
"city": city,
},
})
}
})
},
},
"博主首页": {
ParseFunc: func(ctx *Context) {
//昵称
name := ctx.GetTemp("name","").(string)
//粉丝数
fansNum := ctx.GetTemp("fansNum","").(string)
//地区
city := ctx.GetTemp("city","").(string)
//微博数
weiboNum := ctx.GetDom().Find(".tc").Text()
j := strings.LastIndex(weiboNum,"[")
z := strings.LastIndex(weiboNum,"]")
weiboNum = weiboNum[j+1:z]
//关注数
attentionNum := ctx.GetDom().Find(".tip2 a").Eq(0).Text()
j = strings.LastIndex(attentionNum,"[")
z = strings.LastIndex(attentionNum,"]")
attentionNum = attentionNum[j+1:z]
a :=ctx.GetDom().Find(".ut a").Eq(1)
if a.Text() == "加关注"{
if url, ok := ctx.GetDom().Find(".ut a").Eq(3).Attr("href"); ok {
ctx.AddQueue(&request.Request{
Url: "https://weibo.cn" + url,
Rule: "资料页",
EnableCookie: true,
Temp: map[string]interface{}{
"name": name,
"fansNum": fansNum,
"city": city,
"weiboNum": weiboNum,
"attentionNum": attentionNum,
},
})
}
} else{
if url, ok := ctx.GetDom().Find(".ut a").Eq(2).Attr("href"); ok {
ctx.AddQueue(&request.Request{
Url: "https://weibo.cn" + url,
Rule: "资料页",
EnableCookie: true,
Temp: map[string]interface{}{
"name": name,
"fansNum": fansNum,
"city": city,
"weiboNum": weiboNum,
"attentionNum": attentionNum,
},
})
}
}
},
},
"资料页": {
ItemFields: []string{
"昵称",
"粉丝数",
"地区",
"微博数",
"关注数",
"标签",
"详细信息",
},
ParseFunc: func(ctx *Context) {
//昵称
name := ctx.GetTemp("name","").(string)
//粉丝数
fansNum := ctx.GetTemp("fansNum","").(string)
//地区
city := ctx.GetTemp("city","").(string)
//微博数
weiboNum := ctx.GetTemp("weiboNum","").(string)
//关注数
attentionNum := ctx.GetTemp("attentionNum","").(string)
str := ctx.GetDom().Find("div").Eq(5).Text()
i := strings.LastIndex(str,"标签")
z := strings.LastIndex(str,"更多")
var str2,str3 string
if i == -1{
str2 = ""
str3 = str
}else{
//标签
str2 = str[i+7:z]
//详细信息
str3 = str[0:i]
}
ctx.Output(map[int]interface{}{
0: name,
1: fansNum,
2: city,
3: weiboNum,
4: attentionNum,
5: str2,
6: str3,
})
},
},
},
},
}
2. 相关方法(时间戳、用户名编码与密码加密)
// getMillisecond returns the current Unix time in milliseconds.
//
// The value is appended as the "_" query parameter of the SSO requests.
// Dividing the nanosecond clock by 1000 would yield microseconds, so the
// divisor is one millisecond expressed in nanoseconds.
func getMillisecond() int64 {
	return time.Now().UnixNano() / int64(time.Millisecond)
}
// encryptUname returns uname encoded with the URL-safe base64 alphabet,
// including '=' padding. This is an encoding step, not encryption.
func encryptUname(uname string) string {
	raw := []byte(uname)
	encoded := make([]byte, base64.URLEncoding.EncodedLen(len(raw)))
	base64.URLEncoding.Encode(encoded, raw)
	return string(encoded)
}
// Password encryption helpers.

// string2big parses s as a base-16 number and returns it as a *big.Int.
// The result is never nil; if s is not valid hexadecimal the parse failure
// is not reported and the returned value is unspecified.
func string2big(s string) *big.Int {
	var parsed big.Int
	parsed.SetString(s, 16)
	return &parsed
}
// encryptPassword produces the value sent as the "sp" login field.
//
// pubkey is the RSA modulus as a hexadecimal string; the public exponent is
// fixed at 65537 (0x10001). The plaintext is
//
//	servertime + "\t" + nonce + "\n" + password
//
// which is encrypted with RSA PKCS#1 v1.5 and returned hex-encoded.
//
// It returns "" when pubkey is not a positive hexadecimal number or when the
// encryption fails (for example because the plaintext is too long for the
// key).
func encryptPassword(pubkey string, servertime string, nonce string, password string) string {
	// Parse the modulus here so that a malformed value is detected rather
	// than handed to the RSA code.
	modulus, ok := new(big.Int).SetString(pubkey, 16)
	if !ok || modulus.Sign() <= 0 {
		return ""
	}
	pub := rsa.PublicKey{
		N: modulus,
		E: 65537, // 0x10001
	}

	// A tab separates servertime and nonce; a newline precedes the password.
	plain := servertime + "\t" + nonce + "\n" + password

	encrypted, err := rsa.EncryptPKCS1v15(rand.Reader, &pub, []byte(plain))
	if err != nil {
		return ""
	}
	return hex.EncodeToString(encrypted)
}