Python爬取全书网后,对python几个模块的了解
新人,仅接触 Python 两天,尚未深入学习,代码较粗糙,请多包涵。
直接发代码:
#!/usr/bin/python # -*- coding: utf-8 -*- #爬取速度较慢,主要是卡在urllib.openurl().read()上 import urllib, re, MySQLdb,time class Sql(object): #创建SQL类来处理数据操作 # conn=MySQLdb.connect(host="localhost", user="root",passwd= "root", db="quanshuwang",charset="utf8") conn = MySQLdb.connect(host="mysql.xxx.com", port=7150, user="noveltest", passwd="123456", db="noveltest", charset="utf8") def addBooks(self,sort,name,imgurl,des,status,author): cur = self.conn.cursor() sql="INSERT INTO novel(sort,name,imgurl,description,status,author) values(%s,'%s','%s','%s','%s','%s')"%(sort,name,imgurl,des,status,author) cur.execute(sql) lastrowid = cur.lastrowid self.conn.commit() return lastrowid def addChapters(self,novelid,title,content): pass cur=self.conn.cursor() sql="insert into chapter(novelid,title,content) values(%s,'%s','%s')"%(novelid,title,content) cur.execute(sql) lastrowid=cur.lastrowid self.conn.commit() return lastrowid def getBookList(): #取出所有书的名称和地址 print 'getbooklist:',time.strftime('%H:%M:%S') url='http://www.quanshuwang.com/map/1.html' html=urllib.urlopen(url).read() #获取HTML源码 html=html.decode('gbk').encode('utf-8') #先用decode()将源码的GBK转换为通用的UNICODE编码,再解压成UTF-8 reg=r'<a href="(/book/.*?)" target="_blank">(.*?)</a>' #正则,锁定(.*?) .为任意字符,*可匹配多个,?非贪婪模式,取最短那个 return re.findall(reg,html) #用RE.FINDALL取出正则匹配的文本 def getAuthor(bookurl): print 'StartgetAuthorAndbookurl:',bookurl,time.strftime('%H:%M:%S') bookurl=bookurl.split('/')[-2] url = 'http://www.quanshuwang.com/book_%s.html' %bookurl print 'startReadHtml:',time.strftime('%H:%M:%S') html = urllib.urlopen(url).read() print 'EndReadHtmlandStartDecode:',time.strftime('%H:%M:%S') html=html.decode('gbk').encode('utf-8') print 'StartgetAuthormsgbookandEndDecode:',time.strftime('%H:%M:%S') sort=re.findall(r'<a href="/list/(\d+)_.*?" 
class="c009900">.*?</a> >', html)[0] author=re.findall(r'<dt>作 者:</dt><dd> <a href=".*?">(.*?)</a></dd>',html)[0] status=re.findall(r'dl><dt>小说状态:</dt><dd>(.*?)</dd></dl>',html)[0] imgurl=re.findall(r'<img onerror="this.src=.*?" src="(.*?)" width="160" height="200" border="0" alt=".*?" title=".*?" />',html)[0] des=re.findall(r'div class="infoDetail"><div id="waa" style="height:72px;width:690px;overflow:hidden;">(.*?)<br />',html)[0] print 'EndgetAuthormsgbook:',time.strftime('%H:%M:%S') bookmsg={'sort':sort,'author':author,'status':status,'imgurl':imgurl,'des':des} return bookmsg def getChapterList(bookurl):#得到每篇文章的章节 url='http://www.quanshuwang.com%s'%bookurl html=urllib.urlopen(url).read().decode('gb2312').encode('utf-8') reg=r'<li><a href="(.*?)" title=".*?">(.*?)</a></li>' return re.findall(reg,html) def getContent(bookurl,chapterurl): #取得每篇文章的内容 url='http://www.quanshuwang.com%s'%bookurl.replace(bookurl.split('/')[-1],chapterurl) #用章节地址替换书目地址里的链接 html=urllib.urlopen(url).read().decode('gbk').encode('utf-8') reg=r'style5\(\);</script>(.*?)<script type="text/javascript">style6' return re.findall(reg,html)[0] mysql=Sql() #打开数据库链接 for bookname in getBookList(): #循环爬取小说内容 print 'for:',time.strftime('%H:%M:%S') msgbook={} bookurl=bookname[0] #书的URL name=bookname[1] #书名 print 'Startmsgbooklen',len(msgbook) msgbook=getAuthor(bookurl) print 'EndgetauthorandEndmsgbook:',len(msgbook),time.strftime('%H:%M:%S') sort=msgbook['sort'] author = msgbook['author'] imgurl = msgbook['imgurl'] des = msgbook['des'] status=msgbook['status'] urllib.urlretrieve(imgurl, "images/%s.jpg" %name.decode('utf-8'))#因为当前环境为utf-8,故要把书名转成unicode #print type(sort),type(author),type(name),type(imgurl),type(des),type(status) # novelid=mysql.addBooks(int(sort),name,imgurl,des,status,author) print '正在存储小说%s'%name # for chapter in getChapterList(bookurl): # chapterurl=chapter[0] #章节URL # chaptername=chapter[1] #章节名称 # content=getContent(bookurl,chapterurl) #小说内容 # print '正在存储章节%s'%chaptername # 
mysql.addChapters(novelid,chaptername,content) mysql.conn.close()