简单的搜索引擎
核心思想就是
- 爬取指定页面,提取出页面中的url,进行递归爬取,可以指定递归深度
- 提取网页中的文字内容,根据一定规则进行分词,保存在数据库中,分出的单词和url对应存储
- 对查询参数分词,然后查询数据库中各个单词对应的url,然后返回
对搜索结果进行排名:
- 基于内容对搜索结果进行排序
- 单词频度
- 文档位置
- 单词距离
- 利用外部回指链接排名
- PageRank
- 使用链接文本
- 利用人工神经网络进行排名
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
import sqlite3
import re
# Words that are skipped when indexing page text and link text.
ignorewords = set(['the', 'of', 'a', 'and', 'to', 'in', 'is', 'it'])
class crawler:
    """Crawl pages breadth-first and index their words and links in SQLite.

    All methods operate on the connection opened in ``__init__``; the tables
    they read and write are the ones created by ``createindextables``.
    """

    def __init__(self, dbname):
        """Open (creating it if necessary) the SQLite database *dbname*."""
        self.con = sqlite3.connect(dbname)

    def __del__(self):
        """Close the database connection when the crawler is reclaimed."""
        # ``__del__`` is the finalizer; ``__delete__`` is the unrelated
        # descriptor hook and was never invoked.  ``getattr`` covers the case
        # where ``__init__`` failed before ``self.con`` was assigned.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def createindextables(self):
        """Create the index tables and their indexes if they do not exist."""
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer,toid integer)',
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists wordurlidx on wordlocation(wordid)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()

    def separatewords(self, text):
        """Split *text* on runs of non-word characters and lowercase the parts.

        Returns a list of non-empty lowercase tokens.
        """
        # ``\W+`` rather than ``\W*``: a pattern that can match the empty
        # string makes ``re.split`` cut between every character on
        # Python 3.7+, which would index single letters instead of words.
        return [s.lower() for s in re.split(r'\W+', text) if s != '']

    def addtoindex(self, url, soup):
        """Index every word of the parsed page *soup* under *url*.

        Does nothing when the URL has already been indexed.  Words listed in
        ``ignorewords`` are skipped, but still count towards word positions.
        """
        if self.isindexed(url):
            return
        words = self.separatewords(self.gettextonly(soup))
        urlid = self.getentryid('urllist', 'url', url)
        for location, word in enumerate(words):
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                'insert into wordlocation(urlid, wordid, location) '
                'values (?, ?, ?)', (urlid, wordid, location))

    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of the row in *table* whose *field* equals *value*.

        The row is inserted first when it does not exist yet.  *table* and
        *field* are interpolated into the SQL text and must be trusted
        identifiers; *value* is bound as a parameter, so it may contain
        quotes.  *createnew* is accepted for API compatibility and is not
        consulted.
        """
        cur = self.con.execute(
            'select rowid from %s where %s=?' % (table, field), (value,))
        res = cur.fetchone()
        if res is not None:
            return res[0]
        cur = self.con.execute(
            'insert into %s(%s) values (?)' % (table, field), (value,))
        return cur.lastrowid

    def isindexed(self, url):
        """Return True if *url* is known and has at least one indexed word."""
        u = self.con.execute(
            'select rowid from urllist where url=?', (url,)).fetchone()
        if u is None:
            return False
        # A URL may be present only because something links to it; it counts
        # as indexed only once word locations have been stored for it.
        v = self.con.execute(
            'select 1 from wordlocation where urlid=? limit 1',
            (u[0],)).fetchone()
        return v is not None

    def gettextonly(self, soup):
        """Return the text inside *soup*, recursing into child nodes.

        A node with a ``string`` yields that string stripped; otherwise the
        texts of its ``contents`` are concatenated, each followed by a newline.
        """
        v = soup.string
        if v is not None:
            return v.strip()
        return ''.join(self.gettextonly(t) + '\n' for t in soup.contents)

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Record a link from *urlFrom* to *urlTo* and the words of its text.

        Self-links are ignored.  Words listed in ``ignorewords`` are skipped.
        """
        words = self.separatewords(linkText)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid:
            return
        cur = self.con.execute(
            'insert into link(fromid,toid) values (?,?)', (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                'insert into linkwords(linkid,wordid) values (?,?)',
                (linkid, wordid))

    def crawl(self, pages, depth=2):
        """Index *pages*, then the pages they link to, *depth* levels deep.

        Pages that cannot be opened are reported and skipped.  Changes are
        committed after each page.
        """
        for _ in range(depth):
            newpages = set()
            for page in pages:
                try:
                    c = urllib2.urlopen(page)
                except Exception:
                    print("could not open %s" % page)
                    continue
                try:
                    soup = BeautifulSoup(c)
                finally:
                    c.close()
                self.addtoindex(page, soup)
                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    # Drop the fragment so one document is not queued twice.
                    # URLs containing quotes no longer need to be skipped:
                    # every query binds its values as parameters.
                    url = urljoin(page, link['href']).split('#')[0]
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linktext = self.gettextonly(link)
                    self.addlinkref(page, url, linktext)
                self.dbcommit()
            pages = newpages
pages = ["http://www.iplaypy.com/jichu/set.html"]
# Bind the instance to its own name: assigning it to `crawler` would shadow
# the class and make it impossible to create a second crawler afterwards.
page_crawler = crawler('test.db')
page_crawler.crawl(pages)
# On a brand-new database the tables must be created before crawling:
#page_crawler.createindextables()
import sqlite3
import nn
# Module-level neural network backed by nn.db; searcher.nnscore reads this name.
network = nn.searchnet('nn.db')
# 搜索
class searcher:
    """Query the crawler's SQLite index and rank the matching URLs."""

    def __init__(self, dbname):
        """Open the SQLite index database *dbname*."""
        self.con = sqlite3.connect(dbname)

    def __del__(self):
        """Close the database connection when the searcher is reclaimed."""
        # ``__del__`` is the finalizer; ``__delete__`` is the unrelated
        # descriptor hook and was never invoked.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def getmatchrows(self, query):
        """Find every URL that contains all indexed words of *query*.

        Returns ``(rows, wordids)``.  Each row is
        ``(urlid, location_of_word0, location_of_word1, ...)`` and *wordids*
        lists the ids of the query words found in the index.  Words that are
        not indexed are ignored; if none is indexed, ``([], [])`` is returned.
        """
        fieldlist = 'w0.urlid'
        tablelist = ''
        clauselist = ''
        wordids = []
        # The index stores lowercase words, so look them up in lowercase.
        words = query.lower().split(' ')
        print(words)
        tablenumber = 0
        for word in words:
            wordrow = self.con.execute(
                'select rowid from wordlist where word=?', (word,)).fetchone()
            if wordrow is None:
                continue
            wordid = wordrow[0]
            wordids.append(wordid)
            print(tablenumber)
            # Self-join wordlocation once per word, chained on urlid.
            if tablenumber > 0:
                tablelist += ','
                clauselist += ' and '
                clauselist += 'w%d.urlid=w%d.urlid and ' % (tablenumber - 1, tablenumber)
            fieldlist += ', w%d.location' % tablenumber
            tablelist += 'wordlocation w%d' % tablenumber
            clauselist += 'w%d.wordid=%d' % (tablenumber, wordid)
            tablenumber += 1
        if not wordids:
            # Without any known word the statement would have empty FROM and
            # WHERE clauses and fail to parse.
            return [], wordids
        fullquery = 'select %s from %s where %s' % (fieldlist, tablelist, clauselist)
        print(fullquery)
        rows = list(self.con.execute(fullquery))
        return rows, wordids

    def getscoredlist(self, rows, wordids):
        """Return ``{urlid: score}`` as a weighted sum of the ranking metrics.

        Returns an empty dict when *rows* is empty.
        """
        if not rows:
            return {}
        totalscores = {row[0]: 0 for row in rows}
        weights = [(0.2, self.locationscore(rows)),
                   (0.3, self.frequencyscore(rows)),
                   (0.1, self.distancescore(rows)),
                   (0.2, self.pagerankscore(rows)),
                   (0.2, self.linktextscore(rows, wordids))]
        for (weight, scores) in weights:
            for url in totalscores:
                totalscores[url] += weight * scores[url]
        return totalscores

    def geturlname(self, id):
        """Return the URL stored in ``urllist`` under rowid *id*."""
        return self.con.execute(
            'select url from urllist where rowid=?', (id,)).fetchone()[0]

    def query(self, query):
        """Print the ten best-scoring URLs for *query*, best first."""
        rows, wordids = self.getmatchrows(query)
        scores = self.getscoredlist(rows, wordids)
        rankedscores = sorted(
            [(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, urlid) in rankedscores[0:10]:
            print('%f %s' % (score, self.geturlname(urlid)))

    def nomalizescores(self, scores, smallisbetter=0):
        """Scale *scores* so that the best entry maps to 1.

        With *smallisbetter* each value becomes ``min / value``; otherwise
        ``value / max``.  An empty dict is returned unchanged.
        """
        if not scores:
            return {}
        # Stand-in for a zero divisor, to avoid division by zero.
        vsmall = 0.00001
        if smallisbetter:
            minscore = min(scores.values())
            return {u: float(minscore) / max(vsmall, l)
                    for (u, l) in scores.items()}
        maxscore = max(scores.values())
        if maxscore == 0:
            maxscore = vsmall
        return {u: float(c) / maxscore for (u, c) in scores.items()}

    def frequencyscore(self, rows):
        """Score URLs by how many match rows they have (more is better)."""
        counts = {row[0]: 0 for row in rows}
        for row in rows:
            counts[row[0]] += 1
        return self.nomalizescores(counts)

    def locationscore(self, rows):
        """Score URLs by how early the query words occur (earlier is better)."""
        # 1000000 is the upper bound assumed for a summed word position.
        locations = {row[0]: 1000000 for row in rows}
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]:
                locations[row[0]] = loc
        return self.nomalizescores(locations, smallisbetter=1)

    def distancescore(self, rows):
        """Score URLs by how close together the query words occur.

        With a single query word every URL scores 1.0.
        """
        if not rows:
            return {}
        if len(rows[0]) <= 2:
            return {row[0]: 1.0 for row in rows}
        mindistance = {row[0]: 1000000 for row in rows}
        for row in rows:
            dist = sum(abs(row[i] - row[i - 1]) for i in range(2, len(row)))
            if dist < mindistance[row[0]]:
                mindistance[row[0]] = dist
        # A smaller distance is a better match, so normalize accordingly.
        return self.nomalizescores(mindistance, smallisbetter=1)

    def inboundlinkscore(self, rows):
        """Score URLs by the number of links that point to them."""
        uniqueurls = set(row[0] for row in rows)
        inboundcount = {
            u: self.con.execute(
                'select count(*) from link where toid=?', (u,)).fetchone()[0]
            for u in uniqueurls}
        return self.nomalizescores(inboundcount)

    def calculatepagerank(self, iterations=20):
        """Rebuild the ``pagerank`` table with *iterations* PageRank passes.

        Every URL starts at 1.0; each pass sets a URL's score to 0.15 plus
        0.85 times the sum, over pages linking to it, of the linker's score
        divided by the linker's outgoing link count.
        """
        self.con.execute('drop table if exists pagerank')
        self.con.execute('create table pagerank(urlid primary key, score)')
        self.con.execute('insert into pagerank select rowid, 1.0 from urllist')
        self.con.commit()
        # Read the ids up front so no SELECT cursor is left open while the
        # loop below updates and commits.
        urlids = [row[0] for row in self.con.execute('select rowid from urllist')]
        for i in range(iterations):
            print('Iterator %d' % i)
            for urlid in urlids:
                pr = 0.15
                linkers = self.con.execute(
                    'select distinct fromid from link where toid=?',
                    (urlid,)).fetchall()
                for (linker,) in linkers:
                    linkingpr = self._pagerank(linker)
                    linkingcount = self.con.execute(
                        'select count(*) from link where fromid=?',
                        (linker,)).fetchone()[0]
                    pr += 0.85 * (float(linkingpr) / linkingcount)
                self.con.execute(
                    'update pagerank set score=? where urlid=?', (pr, urlid))
            self.con.commit()

    def _pagerank(self, urlid):
        """Return the stored PageRank of *urlid*, or 0.0 if it has no row."""
        res = self.con.execute(
            'select score from pagerank where urlid=?', (urlid,)).fetchone()
        return 0.0 if res is None else res[0]

    def pagerankscore(self, rows):
        """Score URLs by their stored PageRank."""
        pageranks = {row[0]: self._pagerank(row[0]) for row in rows}
        return self.nomalizescores(pageranks)

    def linktextscore(self, rows, wordids):
        """Score URLs by the PageRank of pages linking to them with a query word.

        For every link whose text contains a query word, the PageRank of the
        linking page is added to the score of the link's target.
        """
        linkscores = {row[0]: 0 for row in rows}
        for wordid in wordids:
            # linkwords.linkid (not linkwords.rowid) is what refers to a link.
            cur = self.con.execute(
                'select link.fromid, link.toid from linkwords, link '
                'where wordid=? and linkwords.linkid=link.rowid', (wordid,))
            for (fromid, toid) in cur.fetchall():
                if toid in linkscores:
                    linkscores[toid] += self._pagerank(fromid)
        return self.nomalizescores(linkscores)

    def nnscore(self, rows, wordids):
        """Score URLs with the module-level neural network ``network``."""
        urlids = sorted(set(row[0] for row in rows))
        nnres = network.getresult(wordids, urlids)
        scores = {urlid: nnres[i] for i, urlid in enumerate(urlids)}
        return self.nomalizescores(scores)
# Bind the instance to its own name: assigning it to `searcher` would shadow
# the class and make it impossible to create a second searcher afterwards.
search_engine = searcher('test.db')
#search_engine.getmatchrows('set Python')
# query() reads the pagerank table, which calculatepagerank() (re)builds:
#search_engine.calculatepagerank()
search_engine.query('set')
['set']
0
select w0.urlid, w0.location from wordlocation w0 where w0.wordid=4
0.798721 http://www.iplaypy.com/jichu/set.html
0.356789 http://www.iplaypy.com/jichu/
0.342161 http://www.iplaypy.com/jichu/var.html
0.341273 http://www.iplaypy.com/jichu/dict.html
0.339879 http://www.iplaypython.com/jichu/dict.html
0.328156 http://www.iplaypy.com/jichu/dir.html
0.328135 http://www.iplaypy.com/jichu/note.html
0.328107 http://www.iplaypy.com/jichu/function.html
0.328074 http://www.iplaypy.com/jichu/int.html
0.328048 http://www.iplaypy.com/jichu/class.html
神经网络
输入 —> 神经网络层(可以包含多层,每一层有多个节点) —> 输出
神经网络层需要大量的输入输出来训练:如果某一对输入输出在神经网络层中没有对应的节点,就新增一个节点;如果已有对应节点,就更新“输入——节点”和“节点——输出”的权重值。这些权重值构成一个矩阵,经过大量数据的训练后,每个权重值会越来越接近真实值
训练过程:根据神经网络中的权重值和输入值计算出输出值,然后将该输出值与给定输入所对应的期望输出值进行对比,据此重新校正权重矩阵
from math import tanh
import sqlite3
def dtanh(y):
    """Return ``1 - y*y``, the slope of tanh expressed via its output *y*."""
    squared = y * y
    return 1.0 - squared
class searchnet:
    """Three-layer network (words -> hidden nodes -> URLs) stored in SQLite.

    Connection strengths live in the ``wordhidden`` and ``hiddenurl`` tables;
    ``setupnetwork`` loads the part relevant to one query into memory.
    """

    def __init__(self, dbname):
        """Open (creating it if necessary) the SQLite database *dbname*."""
        self.con = sqlite3.connect(dbname)

    def __del__(self):
        """Close the database connection when the network is reclaimed."""
        # ``__del__`` is the finalizer; ``__delete__`` is the unrelated
        # descriptor hook and was never invoked.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def maketables(self):
        """Create the network tables if they do not exist."""
        self.con.execute('create table if not exists hiddennode(create_key)')
        self.con.execute('create table if not exists wordhidden(fromid, toid, strength)')
        self.con.execute('create table if not exists hiddenurl(fromid, toid, strength)')
        self.con.commit()

    def _tablename(self, layer):
        """Return the table holding the connections of *layer* (0 = input side)."""
        return 'wordhidden' if layer == 0 else 'hiddenurl'

    def getstrength(self, fromid, toid, layer):
        """Return the strength of the *fromid* -> *toid* connection.

        Missing connections default to -0.2 on layer 0 (word -> hidden) and
        to 0 on the output side (hidden -> URL).
        """
        res = self.con.execute(
            'select strength from %s where fromid=? and toid=?'
            % self._tablename(layer), (fromid, toid)).fetchone()
        if res is None:
            return -0.2 if layer == 0 else 0
        return res[0]

    def setstrength(self, fromid, toid, layer, strength):
        """Store *strength* for the connection, inserting or updating its row."""
        tablename = self._tablename(layer)
        # Select the rowid (not the strength) so the UPDATE below hits the
        # existing row.
        res = self.con.execute(
            'select rowid from %s where fromid=? and toid=?' % tablename,
            (fromid, toid)).fetchone()
        if res is None:
            self.con.execute(
                'insert into %s (fromid, toid, strength) values (?, ?, ?)'
                % tablename, (fromid, toid, strength))
        else:
            self.con.execute(
                'update %s set strength=? where rowid=?' % tablename,
                (strength, res[0]))

    def generatehiddennode(self, wordids, urls):
        """Create the hidden node for this word combination if it is new.

        Nothing is done for more than three words.  A new node is linked to
        every word with strength ``1/len(wordids)`` and to every URL in *urls*
        with strength 0.1.
        """
        if len(wordids) > 3:
            return None
        create_key = '_'.join(sorted([str(wordid) for wordid in wordids]))
        res = self.con.execute(
            'select rowid from hiddennode where create_key=?',
            (create_key,)).fetchone()
        if res is None:
            cur = self.con.execute(
                'insert into hiddennode (create_key) values (?)', (create_key,))
            hiddenid = cur.lastrowid
            # Default input-side weights.
            for wordid in wordids:
                self.setstrength(wordid, hiddenid, 0, 1.0 / len(wordids))
            # Default output-side weights.
            for url in urls:
                self.setstrength(hiddenid, url, 1, 0.1)
            self.con.commit()

    def getallhiddenids(self, wordids, urlids):
        """Return the sorted ids of hidden nodes connected to the query.

        A hidden node qualifies when it is linked from one of *wordids* or
        linked to one of *urlids*.
        """
        hiddenids = set()
        for wordid in wordids:
            # The hidden node is the *target* of a word -> hidden link.
            cur = self.con.execute(
                'select toid from wordhidden where fromid=?', (wordid,))
            hiddenids.update(row[0] for row in cur)
        for urlid in urlids:
            # The hidden node is the *source* of a hidden -> URL link.
            cur = self.con.execute(
                'select fromid from hiddenurl where toid=?', (urlid,))
            hiddenids.update(row[0] for row in cur)
        return sorted(hiddenids)

    def setupnetwork(self, wordids, urlids):
        """Load the sub-network for *wordids* and *urlids* into memory."""
        # Node id lists.
        self.wordids = wordids
        self.hiddenids = self.getallhiddenids(wordids, urlids)
        self.urlids = urlids
        # Node outputs.
        self.ai = [1.0] * len(self.wordids)
        self.ah = [1.0] * len(self.hiddenids)
        self.ao = [1.0] * len(self.urlids)
        print(self.ao)
        # Weight matrices: wi[word][hidden] and wo[hidden][url].
        self.wi = [[self.getstrength(wordid, hiddenid, 0)
                    for hiddenid in self.hiddenids]
                   for wordid in self.wordids]
        self.wo = [[self.getstrength(hiddenid, urlid, 1)
                    for urlid in self.urlids]
                   for hiddenid in self.hiddenids]

    def feedforward(self):
        """Propagate the query through the network; return the URL outputs."""
        # Every query word is an active input.
        for i in range(len(self.wordids)):
            self.ai[i] = 1.0
        # Hidden-layer activations.
        for j in range(len(self.hiddenids)):
            summ = 0.0
            for i in range(len(self.wordids)):
                summ += self.ai[i] * self.wi[i][j]
            self.ah[j] = tanh(summ)
        # Output-layer activations.
        for k in range(len(self.urlids)):
            summ = 0.0
            for j in range(len(self.hiddenids)):
                summ += self.ah[j] * self.wo[j][k]
            self.ao[k] = tanh(summ)
        return self.ao[:]

    def getresult(self, wordids, urlids):
        """Return the network output for each of *urlids*, in that order."""
        self.setupnetwork(wordids, urlids)
        return self.feedforward()

    def backpropagete(self, targets, N=0.5):
        """Adjust the in-memory weights towards *targets* with learning rate *N*.

        Must be called after ``feedforward``; *targets* holds the desired
        output for each URL in ``self.urlids``.
        """
        # Output-layer errors.
        output_deltas = [0.0] * len(self.urlids)
        for k in range(len(self.urlids)):
            error = targets[k] - self.ao[k]
            output_deltas[k] = dtanh(self.ao[k]) * error
        # Hidden-layer errors.
        hidden_deltas = [0.0] * len(self.hiddenids)
        for j in range(len(self.hiddenids)):
            error = 0.0
            for k in range(len(self.urlids)):
                error = error + output_deltas[k] * self.wo[j][k]
            hidden_deltas[j] = dtanh(self.ah[j]) * error
        # Update output weights.
        for j in range(len(self.hiddenids)):
            for k in range(len(self.urlids)):
                change = output_deltas[k] * self.ah[j]
                self.wo[j][k] += N * change
        # Update input weights.
        for i in range(len(self.wordids)):
            for j in range(len(self.hiddenids)):
                change = hidden_deltas[j] * self.ai[i]
                self.wi[i][j] += N * change

    def trainquery(self, wordids, urlids, selectedurl):
        """Train the network on one example: *selectedurl* was chosen.

        *selectedurl* must be an element of *urlids*.
        """
        # Make sure a hidden node exists for this word combination.
        self.generatehiddennode(wordids, urlids)
        self.setupnetwork(wordids, urlids)
        self.feedforward()
        targets = [0.0] * len(urlids)
        targets[urlids.index(selectedurl)] = 1.0
        self.backpropagete(targets)
        self.updatedatabase()

    def updatedatabase(self):
        """Write the in-memory weight matrices back to the database."""
        for i in range(len(self.wordids)):
            for j in range(len(self.hiddenids)):
                self.setstrength(self.wordids[i], self.hiddenids[j], 0, self.wi[i][j])
        for j in range(len(self.hiddenids)):
            for k in range(len(self.urlids)):
                self.setstrength(self.hiddenids[j], self.urlids[k], 1, self.wo[j][k])
        self.con.commit()
# Demo: train the network on one example and print its prediction.
net = searchnet('nn.db')
# Run once on a fresh nn.db to create the tables:
#net.maketables()
# Sample word ids and URL ids used by the demo below.
wWorld, wRiver, wBank = 101, 102, 103
uWorldBank, uRiver, uEarth = 201, 202, 203
# Optional: create the hidden node by hand and inspect the stored weights.
#net.generatehiddennode([wWorld, wBank], [uWorldBank, uRiver, uEarth])
#for c in net.con.execute('select * from wordhidden'):
# print c
#for c in net.con.execute('select * from hiddenurl'):
# print c
# Train with uWorldBank as the selected result for the query "world bank",
# then evaluate the same query.
net.trainquery([wWorld, wBank], [uWorldBank, uRiver, uEarth], uWorldBank)
net.getresult([wWorld, wBank], [uWorldBank, uRiver, uEarth])
[1.0, 1.0, 1.0]
[1.0, 1.0, 1.0]
[0.7775224145252707, -0.0110282659654087, -0.0110282659654087]