Python爬取全书网后,对python几个模块的了解
新人,仅接触Python两天,未深入学习,代码较渣,勿喷.
直接发代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
#爬取速度较慢,主要是卡在urllib.urlopen().read()上
import os
import re
import time
import urllib

import MySQLdb
class Sql(object):
    """Thin wrapper around one shared MySQL connection for storing novels and chapters."""

    # One connection shared by all instances, created at class-definition time
    # (same lifetime as the original).  Credentials are hard-coded; consider
    # moving them to configuration.
    conn = MySQLdb.connect(host="mysql.xxx.com", port=7150, user="noveltest",
                           passwd="123456", db="noveltest", charset="utf8")

    def addBooks(self, sort, name, imgurl, des, status, author):
        """Insert one row into `novel`; return the new row's auto-increment id."""
        cur = self.conn.cursor()
        # Parameterized query: the original interpolated values with
        # %-formatting, which broke on apostrophes/quotes in scraped titles
        # and descriptions and was open to SQL injection.
        cur.execute(
            "INSERT INTO novel(sort,name,imgurl,description,status,author) "
            "VALUES (%s,%s,%s,%s,%s,%s)",
            (sort, name, imgurl, des, status, author))
        lastrowid = cur.lastrowid
        self.conn.commit()
        cur.close()
        return lastrowid

    def addChapters(self, novelid, title, content):
        """Insert one row into `chapter` for novel `novelid`; return the new row id."""
        cur = self.conn.cursor()
        # Parameterized for the same reason as addBooks: chapter text is
        # full of quotes and would corrupt a %-formatted statement.
        cur.execute(
            "INSERT INTO chapter(novelid,title,content) VALUES (%s,%s,%s)",
            (novelid, title, content))
        lastrowid = cur.lastrowid
        self.conn.commit()
        cur.close()
        return lastrowid
def getBookList(): #取出所有书的名称和地址
print 'getbooklist:',time.strftime('%H:%M:%S')
url='http://www.quanshuwang.com/map/1.html'
html=urllib.urlopen(url).read() #获取HTML源码
html=html.decode('gbk').encode('utf-8') #先用decode()将源码的GBK转换为通用的UNICODE编码,再解压成UTF-8
reg=r'<a href="(/book/.*?)" target="_blank">(.*?)</a>' #正则,锁定(.*?) .为任意字符,*可匹配多个,?非贪婪模式,取最短那个
return re.findall(reg,html) #用RE.FINDALL取出正则匹配的文本
def getAuthor(bookurl):
print 'StartgetAuthorAndbookurl:',bookurl,time.strftime('%H:%M:%S')
bookurl=bookurl.split('/')[-2]
url = 'http://www.quanshuwang.com/book_%s.html' %bookurl
print 'startReadHtml:',time.strftime('%H:%M:%S')
html = urllib.urlopen(url).read()
print 'EndReadHtmlandStartDecode:',time.strftime('%H:%M:%S')
html=html.decode('gbk').encode('utf-8')
print 'StartgetAuthormsgbookandEndDecode:',time.strftime('%H:%M:%S')
sort=re.findall(r'<a href="/list/(\d+)_.*?" class="c009900">.*?</a> >', html)[0]
author=re.findall(r'<dt>作 者:</dt><dd> <a href=".*?">(.*?)</a></dd>',html)[0]
status=re.findall(r'dl><dt>小说状态:</dt><dd>(.*?)</dd></dl>',html)[0]
imgurl=re.findall(r'<img onerror="this.src=.*?" src="(.*?)" width="160" height="200" border="0" alt=".*?" title=".*?" />',html)[0]
des=re.findall(r'div class="infoDetail"><div id="waa" style="height:72px;width:690px;overflow:hidden;">(.*?)<br />',html)[0]
print 'EndgetAuthormsgbook:',time.strftime('%H:%M:%S')
bookmsg={'sort':sort,'author':author,'status':status,'imgurl':imgurl,'des':des}
return bookmsg
def getChapterList(bookurl):
    """Fetch a book's index page; return a list of (chapter_url, chapter_title) tuples."""
    url = 'http://www.quanshuwang.com%s' % bookurl
    # Decode as GBK, not gb2312: gb2312 is a proper subset of GBK (the rest
    # of this file uses 'gbk'), so rarer characters in chapter titles raised
    # UnicodeDecodeError under the original 'gb2312'.
    html = urllib.urlopen(url).read().decode('gbk').encode('utf-8')
    reg = r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>'
    return re.findall(reg, html)
def getContent(bookurl, chapterurl):
    """Download one chapter page and return the chapter body HTML."""
    # Swap the final path component of the book URL for the chapter link.
    last_segment = bookurl.split('/')[-1]
    url = 'http://www.quanshuwang.com%s' % bookurl.replace(last_segment, chapterurl)
    page = urllib.urlopen(url).read()
    page = page.decode('gbk').encode('utf-8')
    # The chapter text sits between the style5()/style6 script markers.
    matches = re.findall(r'style5\(\);</script>(.*?)<script type="text/javascript">style6', page)
    return matches[0]
mysql=Sql() #打开数据库链接
for bookname in getBookList(): #循环爬取小说内容
print 'for:',time.strftime('%H:%M:%S')
msgbook={}
bookurl=bookname[0] #书的URL
name=bookname[1] #书名
print 'Startmsgbooklen',len(msgbook)
msgbook=getAuthor(bookurl)
print 'EndgetauthorandEndmsgbook:',len(msgbook),time.strftime('%H:%M:%S')
sort=msgbook['sort']
author = msgbook['author']
imgurl = msgbook['imgurl']
des = msgbook['des']
status=msgbook['status']
urllib.urlretrieve(imgurl, "images/%s.jpg" %name.decode('utf-8'))#因为当前环境为utf-8,故要把书名转成unicode
#print type(sort),type(author),type(name),type(imgurl),type(des),type(status)
# novelid=mysql.addBooks(int(sort),name,imgurl,des,status,author)
print '正在存储小说%s'%name
# for chapter in getChapterList(bookurl):
# chapterurl=chapter[0] #章节URL
# chaptername=chapter[1] #章节名称
# content=getContent(bookurl,chapterurl) #小说内容
# print '正在存储章节%s'%chaptername
# mysql.addChapters(novelid,chaptername,content)
mysql.conn.close()
