zoukankan      html  css  js  c++  java
  • Python爬虫(三)--百度贴吧

    1. #coding=utf-8
    2. import urllib2
    3. import urllib
    4. import re
    5. # 处理页面标签类
    6. class Tool:
    7. # 去除img标签,7位长空格
    8. removeImg = re.compile(r'<img.*?>| {7}|')
    9. # 删除超链接标签
    10. removeAddr = re.compile('<a.*?>|</a>')
    11. # 把换行标签换位
    12. replaceLine = re.compile('<tr>|<div>|</div|</p>')
    13. # 将制表<td>换位
    14. replaceTD = re.compile('<td>')
    15. # 将段落开头换为 加两空格
    16. replacePara = re.compile('<p.*?>')
    17. # 将换行符或双换行符换为
    18. replaceBR = re.compile('<br><br>|<br>')
    19. # 删除其他标签
    20. removeExtraTag = re.compile('<.*?>')
    21. def replace(self,x):
    22. x = re.sub(self.removeImg,"",x)
    23. x = re.sub(self.removeAddr,"",x)
    24. x = re.sub(self.replaceLine," ",x)
    25. x = re.sub(self.replaceTD," ",x)
    26. x = re.sub(self.replacePara," ",x)
    27. x = re.sub(self.replaceBR," ",x)
    28. x = re.sub(self.removeExtraTag,"",x)
    29. # 将前后多余的内容删除
    30. return x.strip()
    31. # 百度贴吧爬虫类
    32. class BDTB:
    33. # 初始化,传入基地址,传入是否“只看楼主”参数
    34. def __init__(self,baseUrl,seeLz,floorTag):
    35. # base连接地址
    36. self.basURL = baseUrl
    37. # 是否只看楼主
    38. self.seeLZ = '?see_lz='+str(seeLz)
    39. # 工具类
    40. self.tool = Tool()
    41. # 全局file变量,文件写入操作对象
    42. self.file = None
    43. # 楼层标号,初始为1
    44. self.floor = 1
    45. # 默认的标题
    46. self.defaultTitle = u"百度贴吧"
    47. # 是否写入楼分隔符的标记
    48. self.floorTag = floorTag
    49. # 获取该页(pageNum)的代码
    50. def getPage(self,pageNum):
    51. try:
    52. url = self.basURL + self.seeLZ + '&pn=' + str(pageNum)
    53. request = urllib2.Request(url)
    54. response = urllib2.urlopen(request)
    55. # 返回utf-8格式编码内容
    56. return response.read().decode('utf-8')
    57. except urllib2.URLError,e:
    58. if hasattr(e,"reason"):
    59. print u"连接百度贴吧失败,错误原因",e.reason
    60. return None
    61. # 获得页面标题
    62. def getTitle(self,page):
    63. # 获取标题的正则表达式
    64. pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>',re.S)
    65. result = re.search(pattern, page)
    66. if result:
    67. return result.group(1).strip()
    68. else:
    69. return None
    70. # 提取页数
    71. def getPageNum(self,page):
    72. # 获取页数的正则表达式
    73. pattern = re.compile(r'reply_num.*?</span.*?class="red">(.*?)</span>',re.S)
    74. result = re.search(pattern,page)
    75. if result:
    76. return result.group(1).strip()
    77. else:
    78. return None
    79. # 提取内容
    80. def getContent(self,page):
    81. pattern = re.compile(r'post_content.*?>(.*?)</div>',re.S)
    82. items = re.findall(pattern,page)
    83. contents = []
    84. for item in items:
    85. #将文本内容进行处理,同时在前后加上换行符
    86. content = " " + self.tool.replace(item)+" "
    87. contents.append(content.encode('utf-8'))
    88. return contents
    89. def setFileTitle(self,title):
    90. if title is not None:
    91. self.file = open(title+".txt","w+")
    92. else:
    93. self.file = open(self.defaultTitle+".txt","w+")
    94. # 写文件
    95. def writeData(self,contents):
    96. for item in contents:
    97. if self.floorTag == "1":
    98. floorLine = " "+str(self.floor)+u"-------------------------------------- "
    99. self.file.write(floorLine)
    100. self.file.write(item)
    101. self.floor += 1
    102. def start(self):
    103. indexPage = self.getPage(1)
    104. pageNum = self.getPageNum(indexPage)
    105. title = self.getTitle(indexPage)
    106. self.setFileTitle(title)
    107. if pageNum == None:
    108. print u"URL已失效,请重试"
    109. return
    110. try:
    111. print "该帖子共有"+str(pageNum)+"页。"
    112. for i in range(1,int(pageNum)+1):
    113. print "正在写入第"+str(i)+"页数据"
    114. page = self.getPage(i)
    115. contents = self.getContent(page)
    116. self.writeData(contents)
    117. except IOError,e:
    118. print "写入异常,原因:"+e.message
    119. finally:
    120. print "写入完成"
    121. baseURL = 'http://tieba.baidu.com/p/3138733512'
    122. seeLz = raw_input("是否只获取楼主发言,是输入1,否输入0 ")
    123. floorTag = raw_input('是否写入楼层信息,是输入1,否输入0 ')
    124. bdtb = BDTB(baseURL,seeLz,floorTag)
    125. bdtb.start()




  • 相关阅读:
    CentOS7学习小记
    PHP的time函数返回时间不正确
    MySQL信息提示不是英文问题
    windows下ITOP安装
    Zend安装
    投资日记2015.6
    解决SecureCRT连接linux超时后断开[转]
    红黑树及生成超过32768随机数
    在宏定义中怎么使用可变参数
    C++堆上申请二维数组
  • 原文地址:https://www.cnblogs.com/aniudcs/p/ff8bf2f784e6207675c0571ff4470be7.html
Copyright © 2011-2022 走看看