Lately I have been reading serialized novels on the Tianya forum, and a few things are inconvenient: the author posts an installment, many readers reply, the author posts again, more replies follow, and so on, so reading one complete story can mean paging through the thread many times. So I wrote this Python script: the user enters the URL of the thread's first page and the author's name (if no name is given, the author of the first post is assumed), and the script extracts everything that author wrote in the thread.
# -*- coding: mbcs -*-
import urllib2, os

def getUrlContent(url):
    # Download the raw HTML of one page of the thread.
    ifile = urllib2.urlopen(url)
    data = ifile.read()
    ifile.close()
    return data
def getNextPageUrl(cont):
    # Find the pager table and the "下一页" (next page) link inside it;
    # return the link's href, or None when this is the last page.
    p1 = cont.find("<table border=0>")
    if p1 < 0: return None
    p2 = cont.find('<font color=black>共', p1)
    if p2 < 0: return None
    p3 = cont.find('><font color=#246cae>下一页', p2)
    if p3 < 0: return None
    tmpl = '<a href='
    p4 = cont.rfind(tmpl, p2, p3)
    if p4 < 0: return None
    return cont[p4+len(tmpl):p3]
def getAuthor(cont):
    # Pull the author's name out of the "作者:" link of the first post.
    p1 = cont.find("<TABLE")
    if p1 < 0: return None
    p1 = cont.find('>作者:<a', p1)
    if p1 < 0: return None
    p1 = cont.find("vwriter=", p1)
    if p1 < 0: return None
    s1 = "target=_blank>"
    p1 = cont.find(s1, p1)
    if p1 < 0: return None
    s2 = "</a>"
    p2 = cont.find(s2, p1)
    if p2 < 0: return None
    return cont[p1+len(s1):p2]
def getTitle(cont):
    s1 = '<TITLE>'
    p1 = cont.find(s1)
    if p1 < 0: return None
    p2 = cont.find('</TITLE>', p1)
    if p2 < 0: return None
    return cont[p1+len(s1):p2]
def getByAuthor(cont, author):
    # Collect the body of every post on this page written by `author`.
    # A post header is a <TABLE> whose "vwriter" link holds the poster's
    # name; the post body runs from that header's closing </table> to the
    # next <TABLE> (or to the trailing ad marker on the last post).
    p0 = 0
    res = []
    while 1:
        p1 = cont.find("<TABLE", p0)
        if p1 < 0: break
        p2 = cont.find("vwriter=", p1)
        if p2 < 0: break
        p2 = cont.find(">" + author + "</a>", p2)
        if p2 < 0: break
        p3 = cont.find("</table>", p2)
        if p3 < 0: break
        p4 = cont.find("<TABLE", p3)
        if p4 < 0:
            p4 = cont.find("<!-- google_ad_section_end -->", p3)
            assert p4 > 0
            res.append(cont[p3+8:p4])  # 8 == len("</table>")
            break
        else:
            p0 = p4
            res.append(cont[p3+8:p4])
    return res
def mainProg(url):
    fp = None
    author = ""
    while 1:
        print url
        cont = getUrlContent(url)
        print 'down OK'
        if len(author) == 0:
            # First page: detect the author and open the output file.
            author = getAuthor(cont)
            title = getTitle(cont)
            print 'author:', author, 'title:', title
            # Replace characters that Windows forbids in file names.
            title = title.replace('/', 'x').replace('\\', 'x').replace(':', 'x').replace('*', 'x').replace('?', 'x')
            fname = title + '.htm'
            if os.path.isfile(fname):
                print "File already exists!"
                return
            fp = open(fname, 'w')
        res = getByAuthor(cont, author)
        print 'parse ok', len(res)
        fp.writelines([url + '<br>\n', '<br>--------<br>'.join(res)])
        url = getNextPageUrl(cont)
        if url is None:
            break
    fp.close()

while 1:
    url = raw_input('input url:')
    mainProg(url)
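
If you would rather pass the URL as a command-line argument than type it at the prompt (the draft's commented-out url=sys.argv[1] hints at this), here is a minimal sketch of such an entry point; it assumes this block replaces the interactive loop above:

import sys

if __name__ == '__main__':
    if len(sys.argv) > 1:
        # URL given as the first command-line argument
        mainProg(sys.argv[1])
    else:
        # no argument: fall back to the interactive prompt
        while 1:
            mainProg(raw_input('input url:'))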