After scraping 全书网 (quanshuwang.com) with Python: what I learned about a few Python modules

  • 2017-06-28

I'm brand new to Python (only two days in) and haven't studied it in depth, so the code is rough. Please go easy on me.

Straight to the code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Scraping is slow; most of the time is spent in urllib.urlopen().read()
import urllib, re, MySQLdb, time
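
# urlopen() in Python 2 takes no timeout argument, so one stalled server can
# hang the whole crawl. A global socket timeout is a cheap safeguard (a
# suggested addition, not in the original script):
# import socket
# socket.setdefaulttimeout(10)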

class Sql(object):  # wraps all database operations
    # conn=MySQLdb.connect(host="localhost", user="root",passwd= "root", db="quanshuwang",charset="utf8")
    conn = MySQLdb.connect(host="mysql.xxx.com", port=7150, user="noveltest", passwd="123456", db="noveltest", charset="utf8")
    def addBooks(self, sort, name, imgurl, des, status, author):
        cur = self.conn.cursor()
        # let MySQLdb fill the %s placeholders instead of building the SQL with
        # string formatting, so quotes in names/descriptions cannot break
        # (or inject into) the statement
        sql = "INSERT INTO novel(sort,name,imgurl,description,status,author) VALUES(%s,%s,%s,%s,%s,%s)"
        cur.execute(sql, (sort, name, imgurl, des, status, author))
        lastrowid = cur.lastrowid
        self.conn.commit()
        return lastrowid
    def addChapters(self, novelid, title, content):
        cur = self.conn.cursor()
        sql = "INSERT INTO chapter(novelid,title,content) VALUES(%s,%s,%s)"
        cur.execute(sql, (novelid, title, content))
        lastrowid = cur.lastrowid
        self.conn.commit()
        return lastrowid
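
    # A batched variant (a sketch, not in the original post): with thousands of
    # chapters per book, one executemany() plus a single commit is much cheaper
    # than a commit per row; MySQLdb still fills the %s placeholders itself.
    def addChaptersBulk(self, novelid, chapters):  # chapters: [(title, content), ...]
        cur = self.conn.cursor()
        sql = "INSERT INTO chapter(novelid,title,content) VALUES(%s,%s,%s)"
        cur.executemany(sql, [(novelid, t, c) for t, c in chapters])
        self.conn.commit()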

def getBookList():  # fetch the name and URL of every book on the site map
    print 'getbooklist:', time.strftime('%H:%M:%S')
    url = 'http://www.quanshuwang.com/map/1.html'
    html = urllib.urlopen(url).read()  # fetch the raw HTML
    html = html.decode('gbk').encode('utf-8')  # decode the GBK bytes to Unicode, then re-encode as UTF-8
    reg = r'<a href="(/book/.*?)" target="_blank">(.*?)</a>'  # (.*?) captures; . is any char, * repeats it, ? makes it non-greedy (shortest match)
    return re.findall(reg, html)  # re.findall returns every (url, name) pair the pattern matches
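
# Why the non-greedy ? matters: with several links on one line, a greedy .*
# swallows everything up to the last match. A standalone demo (not part of the
# crawler):
#     >>> s = '<a href="/b/1">A</a><a href="/b/2">B</a>'
#     >>> re.findall(r'href="(.*?)"', s)   # non-greedy: ['/b/1', '/b/2']
#     >>> re.findall(r'href="(.*)"', s)    # greedy: ['/b/1">A</a><a href="/b/2']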

def getAuthor(bookurl):  # scrape a book's metadata from its detail page
    print 'StartgetAuthorAndbookurl:', bookurl, time.strftime('%H:%M:%S')
    bookid = bookurl.split('/')[-2]  # e.g. '/book/0_100/' -> '0_100'
    url = 'http://www.quanshuwang.com/book_%s.html' % bookid
    print 'startReadHtml:', time.strftime('%H:%M:%S')
    html = urllib.urlopen(url).read()
    print 'EndReadHtmlandStartDecode:', time.strftime('%H:%M:%S')
    html = html.decode('gbk').encode('utf-8')
    print 'StartgetAuthormsgbookandEndDecode:', time.strftime('%H:%M:%S')
    sort = re.findall(r'<a href="/list/(\d+)_.*?" class="c009900">.*?</a> >', html)[0]
    author = re.findall(r'<dt>作      者:</dt><dd> <a href=".*?">(.*?)</a></dd>', html)[0]
    status = re.findall(r'dl><dt>小说状态:</dt><dd>(.*?)</dd></dl>', html)[0]
    imgurl = re.findall(r'<img onerror="this.src=.*?" src="(.*?)" width="160" height="200" border="0" alt=".*?" title=".*?" />', html)[0]
    des = re.findall(r'div class="infoDetail"><div id="waa" style="height:72px;width:690px;overflow:hidden;">(.*?)<br />', html)[0]
    print 'EndgetAuthormsgbook:', time.strftime('%H:%M:%S')
    bookmsg = {'sort': sort, 'author': author, 'status': status, 'imgurl': imgurl, 'des': des}
    return bookmsg
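
# The hand-inserted timestamp prints above get noisy quickly. A small decorator
# (a sketch, not in the original) measures per-call wall time with the same
# time module; usage would be putting @timed above a def:
def timed(fn):
    def wrapper(*args, **kwargs):
        start = time.time()
        result = fn(*args, **kwargs)
        print '%s took %.2fs' % (fn.__name__, time.time() - start)
        return result
    return wrapper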

def getChapterList(bookurl):  # list every chapter (url, title) of one book
    url = 'http://www.quanshuwang.com%s' % bookurl
    html = urllib.urlopen(url).read().decode('gbk').encode('utf-8')  # gbk is a superset of the page's gb2312, so it decodes the same text more safely
    reg = r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>'
    return re.findall(reg, html)

def getContent(bookurl, chapterurl):  # fetch one chapter's text
    # swap the last path segment of the book URL for the chapter's file name
    url = 'http://www.quanshuwang.com%s' % bookurl.replace(bookurl.split('/')[-1], chapterurl)
    html = urllib.urlopen(url).read().decode('gbk').encode('utf-8')
    reg = r'style5\(\);</script>(.*?)<script type="text/javascript">style6'
    return re.findall(reg, html)[0]
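
# The replace() trick above swaps the final path segment for the chapter file
# name. The standard library states that intent directly (a sketch, assuming
# chapter links are relative like '123.html'):
#     import urlparse
#     print urlparse.urljoin('http://www.quanshuwang.com/book/0_100/index.html', '123.html')
#     # -> 'http://www.quanshuwang.com/book/0_100/123.html'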

mysql = Sql()  # open the database connection

for bookurl, name in getBookList():  # crawl every book: its URL and its title
    print 'for:', time.strftime('%H:%M:%S')
    msgbook = getAuthor(bookurl)
    print 'EndgetauthorandEndmsgbook:', len(msgbook), time.strftime('%H:%M:%S')
    sort = msgbook['sort']
    author = msgbook['author']
    imgurl = msgbook['imgurl']
    des = msgbook['des']
    status = msgbook['status']
    urllib.urlretrieve(imgurl, "images/%s.jpg" % name.decode('utf-8'))  # name is UTF-8 bytes; decode it so the filename is Unicode
    # print type(sort), type(author), type(name), type(imgurl), type(des), type(status)
    # novelid = mysql.addBooks(int(sort), name, imgurl, des, status, author)
    print 'Saving novel %s' % name
    # for chapter in getChapterList(bookurl):
    #     chapterurl = chapter[0]   # chapter URL
    #     chaptername = chapter[1]  # chapter title
    #     content = getContent(bookurl, chapterurl)  # chapter text
    #     print 'Saving chapter %s' % chaptername
    #     mysql.addChapters(novelid, chaptername, content)
mysql.conn.close()
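
One last urllib note: a crawl over the whole site map will eventually hit a dead link or a network hiccup, and in Python 2 urllib.urlopen() raises IOError on failure, which would abort the loop above. A minimal retry wrapper (a sketch with a hypothetical fetch helper, not part of the original code):

def fetch(url, retries=3):
    # hypothetical helper, not in the script above: retry transient failures
    for attempt in range(retries):
        try:
            return urllib.urlopen(url).read()
        except IOError:
            time.sleep(2)  # brief back-off before the next attempt
    return None  # permanently dead URL; the caller must handle None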