#coding:utf-8
"""Scrape article pages and append their title and paragraphs to a text file.

A page is fetched from ``<url><offset>.shtml``.  Its first ``<h1>`` element
supplies the title, and the markup between the ``articleBody`` marker and the
``class="pages"`` pager supplies the paragraphs.  Page data is handled as raw
bytes from download to disk, so no character encoding is assumed.

The module runs on both Python 2 and Python 3.
"""
import contextlib
import re
import sys

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3 exposes the same ``urlopen``

# File that receives the scraped text when no ``outfile`` is given.
DEFAULT_OUTFILE = 'juyudao.txt'

# Written after the title and after every paragraph.
# NOTE(review): this is a single space in the code as received; if that code
# was whitespace-mangled it may originally have been a newline -- confirm.
SEPARATOR = b' '

_ANY_TAG = re.compile(b'<[^>]+>')
# Strips tags that contain no letter "p", which leaves the "<p>" paragraph
# markers in place for splitting.  Any other tag containing a "p" (for
# example "<span>") is left in the text as well; that is kept as-is.
_TAG_WITHOUT_P = re.compile(b'<[^>p]+>')
_TITLE = re.compile(b'(<h1.*</h1>)')
# First grab the part of the HTML that surrounds the article.
_ARTICLE_REGION = re.compile(b'articleBody(.*)class="pages">', re.S)
_ARTICLE_BODY = re.compile(b'</script>(.*)<divclass="pages">')


def _printable(data):
    """Return *data* as a native ``str`` suitable for ``print``.

    On Python 3 the page bytes are decoded as UTF-8 with replacement
    characters.  This is for console output only; the page's real encoding
    is not known here, and the bytes written to disk are never re-encoded.
    """
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


def _extract_title(content):
    """Return the first ``<h1>...</h1>`` markup in *content*, or ``None``."""
    found = _TITLE.search(content)
    if found is None:
        return None
    return found.group(1)


def _extract_paragraphs(content):
    """Return the article paragraphs of *content* as a list of byte strings.

    Returns ``None`` when the article markers cannot be located.
    """
    # Drop every ASCII whitespace character.  The patterns below are written
    # against whitespace-free markup (note ``<divclass=``), and the body
    # pattern has no DOTALL flag, so a surviving newline would prevent it
    # from matching.
    compact = b''.join(content.split())

    region = _ARTICLE_REGION.search(compact)
    if region is None:
        return None
    body = _ARTICLE_BODY.search(region.group())
    if body is None:
        return None

    text = body.group(1).replace(b'</p>', b'')
    text = _TAG_WITHOUT_P.sub(b'', text)
    return text.split(b'<p>')


def getPage(url, offset='0', outfile=DEFAULT_OUTFILE):
    """Download one page and append its title and paragraphs to *outfile*.

    Args:
        url: URL prefix of the page, without the offset and extension.
        offset: Text inserted between *url* and the ``.shtml`` extension.
        outfile: Path of the file the extracted text is appended to.

    Returns:
        ``None``.  A page with no ``<h1>`` title or no recognisable article
        body is reported on stdout and skipped without touching *outfile*.

    Raises:
        Whatever ``urlopen`` raises when the page cannot be fetched.
    """
    realurl = "%s%s%s" % (url, offset, '.shtml')
    print(realurl)

    # ``closing`` releases the connection on both Python 2 and Python 3.
    with contextlib.closing(urllib2.urlopen(realurl)) as resp:
        content = resp.read()

    title_markup = _extract_title(content)
    if title_markup is None:
        print('no <h1> title in %s error' % realurl)
        return
    print('%s ok' % _printable(title_markup))
    title = _ANY_TAG.sub(b'', title_markup)
    print(_printable(title))

    paragraphs = _extract_paragraphs(content)
    if paragraphs is None:
        print('no article body in %s error' % realurl)
        return

    # Binary append keeps the downloaded bytes untouched on every platform.
    with open(outfile, 'ab') as fp:
        fp.write(title + SEPARATOR)
        fp.writelines(paragraph + SEPARATOR for paragraph in paragraphs)


def getBook(url, startoffset, endOffset, outfile=DEFAULT_OUTFILE):
    """Download pages *startoffset* up to, but excluding, *endOffset*.

    Args:
        url: URL prefix shared by all pages; the page number and ``.shtml``
            are appended to it.
        startoffset: First page number to fetch.
        endOffset: Page number at which to stop (exclusive).
        outfile: Path of the file the extracted text is appended to.
    """
    for page_number in range(startoffset, endOffset):
        getPage(url, offset=str(page_number), outfile=outfile)


if __name__ == '__main__':
    getPage(url='http://book.sohu.com/20131107/n389762800', offset='')
    getBook(url='http://book.sohu.com/20131107/n389762800_',
            startoffset=1, endOffset=20)