# -*- coding: utf-8 -*-
    """
    @author: sichu
    @contact: delnssn@gmail.com
    @software: PyCharm
    @file: main.py
    @time: 2017-10-24 16:32
    """
    
    import urllib    
    import re
       
    
    import MySQLdb
    
    # 解析html
    from bs4 import BeautifulSoup
    # 网络包
    import urllib2
    
    import sys
    import time

    # Python 2 only: switch the interpreter-wide default string encoding from
    # ASCII to UTF-8. sys.setdefaultencoding() is removed from the sys module
    # at startup, so sys has to be reloaded first to get it back -- the order
    # of these two statements matters.
    # NOTE(review): presumably needed so that scraped unicode text can be
    # concatenated with the UTF-8 byte-string literals used in this file --
    # confirm before removing.
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    # Settings for the module-wide MySQL connection. The connection is opened
    # as soon as this module is executed and is used through the global `conn`.
    _DB_SETTINGS = {
        "host": 'localhost',
        "port": 3306,
        "user": 'root',
        "passwd": 'root',
        "db": 'baidu',
        "use_unicode": True,
        "charset": "utf8",
    }
    conn = MySQLdb.connect(**_DB_SETTINGS)
    
    
    # 百度贴吧爬虫类
    class BDTB:
        """Crawler that copies Baidu Tieba threads and their replies into MySQL.

        It reads the thread list of one hard-coded forum, downloads every
        thread found there and stores the thread plus its first-level replies
        through the module-level ``conn`` database connection.
        """

        # Extracts the thread id from the JSON stored in a list item's
        # "data-field" attribute.
        _THREAD_ID_RE = re.compile(r'\{"id":(.+?),"')

        def __init__(self, baseUrl="http://tieba.baidu.com/p/"):
            """Remember the URL prefix that thread ids are appended to.

            :param baseUrl: prefix of a thread URL; the thread id is appended
                directly to it.
            """
            self.baseURL = baseUrl

        @staticmethod
        def _extract_thread_id(data_field):
            """Return the thread id found in *data_field*, or None if absent.

            :param data_field: value of a list item's ``data-field`` attribute;
                may be None when the attribute is missing.
            """
            match = BDTB._THREAD_ID_RE.search(data_field or "")
            return match.group(1) if match else None

        def _fetch_soup(self, url):
            """Download *url* and return it parsed by BeautifulSoup with lxml.

            :raises urllib2.URLError: when the request fails.
            """
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return BeautifulSoup(response.read(), "lxml")
            finally:
                # Release the socket even if reading or parsing fails.
                response.close()

        def getPage(self, pageNum):
            """Fetch one thread and return its title, first post and replies.

            :param pageNum: thread id; appended to the base URL.
            :return: dict with the keys ``uid``, ``title``, ``comment`` and
                ``message`` (list of reply texts), or None when the thread
                could not be downloaded or does not have the expected markup.
            """
            url = self.baseURL + str(pageNum)
            print(url)
            try:
                soup = self._fetch_soup(url)
                try:
                    title = soup.select(".core_title_txt")[0].text
                    comment = soup.select(".d_post_content.j_d_post_content.clearfix")[0].text
                    # The second ".red" element is used as the page count;
                    # converting it here means a non-numeric value skips the
                    # thread instead of crashing the crawler later on.
                    page_count = int(soup.select(".red")[1].text)
                except Exception:
                    # Deliberately broad (best effort), but unlike
                    # BaseException it lets Ctrl-C / SystemExit through.
                    print("getPage -> select -> 错误")
                    return None
                return {
                    "uid": str(pageNum),
                    "title": title,
                    "comment": comment,
                    "message": self.getMessage(url, page_count),
                }
            except urllib2.URLError as e:
                if hasattr(e, "reason"):
                    print(u"连接百度贴吧失败,错误原因" + u" %s" % (e.reason,))
                return None

        def getMessage(self, url, page):
            """Collect the first-level reply texts from every page of a thread.

            :param url: thread URL without a query string.
            :param page: number of pages to fetch (int or numeric string).
            :return: list of reply texts in page order.
            :raises urllib2.URLError: when a page request fails.
            """
            messages = []
            for page_no in range(1, int(page) + 1):
                # Request the paginated URL; fetching the bare thread URL
                # would return page 1 over and over.
                soup = self._fetch_soup(url + "?pn=" + str(page_no))
                for node in soup.select(".p_content.p_content.p_content_nameplate"):
                    messages.append(node.text)
                # Pause between requests to avoid hammering the server.
                time.sleep(1)
            return messages

        def getComment(self):
            """Return the thread ids listed on the forum's first list page.

            List items without a ``data-field`` attribute or without an id in
            it are skipped.

            :raises urllib2.URLError: when the list page cannot be downloaded.
            """
            soup = self._fetch_soup(
                "http://tieba.baidu.com/f?kw=%E6%B9%96%E5%8D%97%E7%A7%91%E6%8A%80%E5%AD%A6%E9%99%A2&ie=utf-8")
            thread_ids = []
            for item in soup.select("li.j_thread_list.clearfix"):
                thread_id = self._extract_thread_id(item.get("data-field"))
                if thread_id is not None:
                    thread_ids.append(thread_id)
            return thread_ids

        def _save_thread(self, page):
            """Insert one thread dict (as returned by getPage) and its replies.

            Values are passed as query parameters so that quotes in scraped
            text can neither break nor inject SQL. A failed reply insert is
            reported and skipped; a failed thread insert or commit is reported
            and the thread is abandoned.
            """
            uid = "bdtg_" + page["uid"]
            cursor = conn.cursor()
            try:
                cursor.execute(
                    "insert into baidu(uid,name,title,comment,date) "
                    "values(%s,%s,%s,%s,now())",
                    (uid, "百度贴吧", page["title"], page["comment"]))
                for text in page["message"]:
                    try:
                        cursor.execute(
                            "insert into message(uid,message) values(%s,%s)",
                            (uid, text))
                    except Exception:
                        print("runOne -> message -> 插入错误")
                conn.commit()
            except Exception:
                print("runOne -》数据库插入错误")
            finally:
                cursor.close()

        def runOne(self):
            """Crawl the forum list once and store every thread that loads."""
            for thread_id in self.getComment():
                time.sleep(1)
                page = self.getPage(thread_id)
                if page is None:
                    # Nothing was fetched for this thread; do not fall through
                    # and store stale or undefined data.
                    continue
                self._save_thread(page)

        def run(self):
            """Run crawl rounds forever, printing a line after each round."""
            i = 0
            while True:
                self.runOne()
                print("结束 运行第" + str(i) + "次")
                i += 1
    
    
    if __name__ == "__main__":
        # Script entry point: crawl until the process is interrupted.
        bdtb = BDTB()
        try:
            bdtb.run()
        finally:
            # run() never returns normally, so the connection can only be
            # closed on the way out of an exception such as Ctrl-C.
            conn.close()
# Last modified: 2017-10-26 03:11 PM