声明:本人并非专业的 Python 从业者,以下代码仅供个人学习使用。
python版本 3.8.5 ,数据库 mongoDB版本 4.2
一、代码
1、数据库连接dbclient.py
import pymongo
def getDB(uri="mongodb://localhost:27017/", dbname="zhanlu"):
    """Return a handle to the MongoDB database used by the crawler.

    Args:
        uri: MongoDB connection string; defaults to a local server.
        dbname: name of the database; defaults to 'zhanlu'.

    Returns:
        A pymongo Database object (connection is established lazily).
    """
    client = pymongo.MongoClient(uri)
    return client[dbname]
2、书本对象book的crud封装类 book.py
class Chapter:
    """One book chapter plus thin CRUD helpers over the 'chapter' collection."""

    def __init__(self):
        # Per-instance fields (previously shared class attributes).
        self._id = 0        # Mongo ObjectId once loaded/saved; 0 means "not persisted"
        self.bookId = 0     # _id of the owning Book document
        self.title = ""
        self.number = 1     # chapter sequence number within the book
        self.content = ""
        self.url = ""       # source page URL the chapter was scraped from

    def load(self, db, _id):
        """Populate this instance from the chapter document with the given _id."""
        r = db["chapter"].find_one({"_id": _id})
        convertToChapter(self, r)

    def hasChapter(self, db, url):
        """Return True if a chapter with this source URL is already stored."""
        r = db["chapter"].find_one({"url": url})
        return r is not None

    def queryChapter(self, db, title):
        """Populate this instance from the chapter with the given title (no-op if absent)."""
        r = db["chapter"].find_one({"title": title})
        convertToChapter(self, r)

    def queryChapters(self, db, bookId, minnum, pageSize):
        """Return up to pageSize chapters of a book with number > minnum, ascending.

        Only 'title' and 'number' are projected; other fields stay at defaults.
        """
        # BUGFIX: pymongo's sort() takes a list of (field, direction) pairs,
        # not a dict, and a Cursor has no len() -- iterate it directly.
        rs = db["chapter"].find(
            {'book': bookId, 'number': {'$gt': minnum}},
            {'title': 1, 'number': 1},
        ).sort([('number', 1)]).limit(pageSize)
        chapters = []
        for r in rs:
            c = Chapter()
            convertToChapter(c, r)
            chapters.append(c)
        return chapters

    def saveChapter(self, db, chapter):
        """Insert `chapter` into the collection and remember the new _id on self."""
        doc = {
            'title': chapter.title,
            'content': chapter.content,
            'book': chapter.bookId,
            'number': int(chapter.number),
            'url': chapter.url,
        }
        r = db["chapter"].insert_one(doc)
        self._id = r.inserted_id
class Book:
    """One book plus thin CRUD helpers over the 'book' collection."""

    def __init__(self):
        # Per-instance fields. BUGFIX: these were class attributes, so the
        # mutable indexPageUrls list was shared by every Book instance.
        self._id = 0            # Mongo ObjectId once loaded/saved; 0 means "not persisted"
        self.title = ""
        self.writerId = 0       # _id of the Writer document
        self.writerName = ""    # convenience copy of the author's name (not persisted)
        self.brief = ""
        self.type = ""
        self.chapterCount = 0
        self.indexPageUrls = []  # chapter page URLs collected while scraping

    def load(self, db, _id):
        """Populate this instance from the book document with the given _id."""
        r = db["book"].find_one({"_id": _id})
        convertToBook(self, r)

    def queryBook(self, db, title):
        """Populate this instance from the book with the given title (no-op if absent)."""
        r = db["book"].find_one({"title": title})
        convertToBook(self, r)

    def saveBook(self, db, book):
        """Insert `book` into the collection and remember the new _id on self."""
        doc = {
            'title': book.title,
            'brief': book.brief,
            'writer': book.writerId,
            'chapterCount': book.chapterCount,
            'type': book.type,
        }
        r = db["book"].insert_one(doc)
        self._id = r.inserted_id
def convertToBook(b, r):
    """Copy fields of Mongo document `r` onto Book instance `b`; no-op when r is None.

    _id/title/writer are indexed directly (always present in stored books);
    the remaining fields go through getValue and default to None if missing.
    """
    if r is not None:
        b._id = r["_id"]
        b.title = r["title"]
        b.writerId = r["writer"]
        b.brief = getValue(r, "brief")
        b.type = getValue(r, "type")
        b.chapterCount = getValue(r, "chapterCount")


def convertToChapter(c, r):
    """Copy fields of Mongo document `r` onto Chapter instance `c`; no-op when r is None."""
    if r is not None:
        # BUGFIX: was `c.id = ...`, which created a new attribute and left
        # c._id untouched, so loaded chapters never carried their ObjectId.
        c._id = getValue(r, "_id")
        c.bookId = getValue(r, "book")
        c.title = getValue(r, "title")
        c.number = getValue(r, "number")
        c.content = getValue(r, "content")
        c.url = getValue(r, "url")


def getValue(r, col):
    """Return r[col], or None (after printing a warning) when the key is missing."""
    try:
        return r[col]
    except KeyError:  # BUGFIX: was a bare except that swallowed every error
        print("没有列 ", col)
        return None
3、作者对象 writer.py
class Writer:
    """One author plus thin CRUD helpers over the 'writer' collection."""

    def __init__(self):
        # Per-instance fields (previously shared class attributes).
        self._id = 0    # Mongo ObjectId once loaded/saved; 0 means "not persisted"
        self.name = ""
        self.brief = ""

    def queryWriter(self, db, name):
        """Populate this instance from the writer with the given name, if stored."""
        r = db["writer"].find_one({"name": name})
        if r is not None:
            self._id = r["_id"]
            self.name = r["name"]
            self.brief = r["brief"]

    def saveWriter(self, db):
        """Insert this writer and remember the generated _id."""
        # BUGFIX: the name was stored under the key 'title', so queryWriter
        # (which filters on 'name') never found previously saved writers and
        # every crawl re-inserted the same author.
        doc = {'name': self.name,
               'brief': self.brief
               }
        r = db["writer"].insert_one(doc)
        self._id = r.inserted_id

    def tostring(self):
        """Debug helper: print this writer's _id."""
        print(self._id)
4、爬取文件代码 spider.py
import requests
import json
import bs4
class Spider(object):
    """Minimal HTTP fetcher that parses the response body with BeautifulSoup."""

    def __init__(self, headers):
        # Headers sent with every request (User-Agent, content-type, ...).
        self.headers = headers

    def read(self, url, params):
        """GET `url` and return a BeautifulSoup tree, or None on a non-200 status.

        `params` is JSON-serialized and sent as the request body (note: an
        unusual choice for GET; kept for compatibility with existing callers).
        The body is decoded as UTF-8 and parsed with the lxml parser.
        """
        print(url, params)
        response = requests.get(url, data=json.dumps(params), headers=self.headers)
        print("status_code :", response.status_code)
        if response.status_code == 200:
            return bs4.BeautifulSoup(response.content.decode("utf-8"), "lxml")
        # BUGFIX: previously returned the string 'error', which callers then
        # treated as a soup object; None makes the failure explicit.
        return None
5、乐文图书网址的一个爬取方法 lewen.py
#!/usr/bin/python3
# Filename: lewen.py
import spider
from library.book import Book, Chapter
lewenhome= "http://www.lewenxsw.com"
headers = {'content-type': "text/html", 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
def queryBook(url):
    """Scrape a book's index page and return a populated Book.

    Fills title, brief, writerName, indexPageUrls and chapterCount; the
    chapter pages themselves are not fetched here.
    """
    sp = spider.Spider(headers)
    content = sp.read(url, {})
    # BUGFIX: was `Book.Book()` -- `Book` is the imported class, not a module,
    # so that call raised AttributeError.
    book = Book()
    # Book title
    elements = content.find_all('h1', class_='text-center')
    book.title = elements[0].text
    # Brief / synopsis
    elements = content.find_all('div', class_='info2')
    book.brief = elements[0].p.text
    # Chapter links and the author link, e.g. <a href="/author/priest">priest</a>
    urls = []
    for item in content.find_all('a'):
        href = item.get("href")
        if not href:
            # Anchors without an href would crash .endswith(); skip them.
            continue
        if href.endswith(".html") and href.startswith("/5/5375/"):
            urls.append(lewenhome + href)
        elif href == "/author/priest":
            book.writerName = item.text
    # Assign a fresh list instead of appending to the (class-shared) attribute.
    book.indexPageUrls = urls
    book.chapterCount = len(urls)
    return book
def queryChapter(url):
    """Scrape one chapter page and return a populated Chapter.

    Fills number, title and content; bookId and url are left for the caller.
    """
    sp = spider.Spider(headers)
    content = sp.read(url, {})
    # BUGFIX: was `Book.Chapter()` -- Chapter is a separate class in
    # library.book, so that call raised AttributeError.
    chapter = Chapter()
    elements = content.find_all('div', class_='panel-heading')
    title_str = elements[0].text
    # Headings look like "<number>. 第x章 <title>": split off the number first,
    # then drop everything up to and including the '章' marker.
    parts = title_str.split('.')
    if len(parts) == 2:
        print(parts[0])
        chapter.number = parts[0].strip()
        title_str = parts[1]
    parts = title_str.split('章')
    if len(parts) == 2:
        title_str = parts[1]
    chapter.title = title_str.strip()
    elements = content.find_all('div', class_='content-body')
    chapter.content = elements[0].text
    print(chapter.title)
    return chapter
if __name__ == "__main__":
    # Manual smoke test: fetch and print a single chapter page.
    queryChapter("http://www.lewenxsw.com/5/5375/7558724.html")
    # Alternative manual test for the book index page (kept for reference):
    #book = queryBook("http://www.lewenxsw.com/5/5375/")
    #print("book title %s, and writer %s ,has %s chapter",book.title,book.writerName,book.chapterCount)
    #if book.indexPageUrls != None and len(book.indexPageUrls) > 0:
    #    for url in book.indexPageUrls:
    #        print(url)
6、爬取一本书的执行代码 spiderBook.py
#!/usr/bin/python3
# Filename: spiderBook.py
import time
import random
import lewen
import db.dbclient as dbclient
import library.writer
import library.book
#from library.book import Chapter
def storebook(webtype, bookIndexUrl):
    """Crawl one book (index page plus every chapter) and persist it to MongoDB.

    Chapters already stored (matched by URL) are skipped, so an interrupted
    run can be resumed. Only webtype 'lewen' is supported.
    """
    book = None
    if webtype == 'lewen':
        # Scrape the book's index page
        book = lewen.queryBook(bookIndexUrl)
    else:
        print("不支持webtype %s" % webtype)
        return
    db = dbclient.getDB()
    # Writer: insert only when not already in the database
    w = library.writer.Writer()
    w.queryWriter(db, book.writerName)
    w.tostring()
    if w._id == 0:
        w.name = book.writerName
        w.brief = book.writerName
        w.saveWriter(db)
        # BUGFIX: %d raised TypeError -- inserted_id is an ObjectId, not an int.
        print("save writer %s , id= %s" % (w.name, w._id))
    # Book: insert only when not already in the database
    book.writerId = w._id
    b = library.book.Book()
    b.queryBook(db, book.title)
    if b._id == 0:
        b.saveBook(db, book)
    # Chapters: fetch and insert every chapter page not yet stored
    if book.indexPageUrls is not None and len(book.indexPageUrls) > 0:
        for url in book.indexPageUrls:
            ch = library.book.Chapter()
            if ch.hasChapter(db, url):
                print("已经有了,跳过。")
                continue
            stop()  # throttle between page fetches
            c = lewen.queryChapter(url)
            if c is not None:
                c.bookId = b._id
                c.url = url
                ch.saveChapter(db, c)
                print("save chapter ", ch._id)
def stop():
    """Pause for a random 1-5 seconds so the crawler doesn't hammer the site."""
    pause = random.randint(1, 5)
    print("停顿一下:", pause)
    time.sleep(pause)
if __name__ == "__main__":
    # Entry point: crawl and store one specific book from the lewen site.
    storebook('lewen', "http://www.lewenxsw.com/5/5375/")
本文地址:https://blog.csdn.net/qq_39476654/article/details/110931345
黄山市民网:https://www.huangshanshimin.com/