首页 » 技术分享 » 小说下载器

小说下载器

 

该程序使用了分布式进程的方法,在服务器端发布任务。在多台客户端同时进行下载,大大的提高了下载效率,同时,在对页面进行解析时使用了多个进程,分别用于提取新的URL和章节内容,以下是源码

 


服务器端源码:

该部分源码分为两部分:第一部分为任务发布器,第二部分将各个客户端返回的数据写入TXT文件中

第一部分:

# coding: utf-8
"""Task-dispatch server for the distributed novel downloader.

Publishes one download task per book onto a shared task queue; worker
clients pull tasks, crawl the book, and push the chapter list back onto
the result queue, which this process writes to TXT files.
"""
import queue
import time
from multiprocessing import freeze_support
from multiprocessing.managers import BaseManager

from DataOutputToTxt import DataOutput

# Module-level queues shared with workers through the manager server.
task_queue = queue.Queue()
result_queue = queue.Queue()
dataOutput = DataOutput()


def get_task():
    """Return the task queue (registered as a manager callable)."""
    return task_queue


def get_result():
    """Return the result queue (registered as a manager callable)."""
    return result_queue


class QueeuManager(BaseManager):
    pass


def win_run():
    """Start the queue server, enqueue one task per book, collect results.

    Blocks until every queued book has come back from a worker, writing
    each returned chapter list to disk as it arrives.
    """
    QueeuManager.register('get_task_queue', callable=get_task)
    QueeuManager.register('get_result_queue', callable=get_result)

    manager = QueeuManager(address=('127.0.0.1', 8001), authkey='cyl'.encode('utf-8'))
    manager.start()
    try:
        task = manager.get_task_queue()
        result = manager.get_result_queue()
        num = int(input("输入要下载的书籍数量\n"))

        for i in range(num):
            book = {}
            book_name = input("请输入第{0}本书的书名\n".format(i + 1))
            book_url = input("请输入书籍链接(以0结尾)\n")
            # The URL is entered with a trailing sentinel '0'; strip it.
            book_url = book_url[0:-1]
            book['name'] = book_name
            book['url'] = book_url
            task.put(book)

        # Poll until every book has been returned by a worker.
        while num > 0:
            if not result.empty():
                data = result.get()
                dataOutput.output_txt(data)
                num = num - 1
                print("还剩{0}本书未爬取".format(num))
            else:
                time.sleep(5)
    except Exception as e:
        # Narrowed from a bare except; surface the cause instead of hiding it.
        print(e)
        print('爬取失败')
    finally:
        manager.shutdown()
        print("爬取结束")


if __name__ == '__main__':
    freeze_support()
    win_run()

 

第二部分:

import codecs


class DataOutput(object):
    """Server-side writer: dumps one downloaded book to a UTF-8 TXT file."""

    def output_txt(self, datas):
        """Write a book to '<book name>.txt' in the current directory.

        datas[0] is the book name; the remaining items are chapter dicts
        with 'title' and 'zhengwen' (chapter body) keys.
        """
        book_name = datas[0] + '.txt'
        # Iterate a slice instead of datas.pop(0) so the caller's list is
        # not mutated; 'with' guarantees the file is closed even on error.
        with codecs.open(book_name, 'w', encoding='utf-8') as fout:
            for chapter in datas[1:]:
                fout.write("%s" % chapter['title'])
                fout.write("\r\n")
                fout.write("%s" % chapter['zhengwen'])
                fout.write("\r\n\n")

 

客户端源码 

任务接收器

# coding :utf-8

from SpiderMan import SpliderMan
from multiprocessing.managers import BaseManager
from multiprocessing import freeze_support
class QueueManager(BaseManager):
    """Client-side manager proxy; queue accessors are registered in __main__."""

def taskWorker():
    """Connect to the dispatch server and crawl books until no tasks remain.

    Each finished book's chapter list is pushed onto the shared result queue.
    """
    server_address = '172.29.200.198'
    m = QueueManager(address=(server_address, 8001), authkey='cyl'.encode('utf-8'))
    m.connect()
    task = m.get_task_queue()
    result = m.get_result_queue()
    while not task.empty():
        book = task.get()
        title, link = book['name'], book['url']
        print("开始爬取{0}".format(title))
        crawler = SpliderMan()
        # Hand the crawl result back to the server via the result queue.
        result.put(crawler.start(link, title))

if __name__ == '__main__':
    freeze_support()
    # Register the remote queue accessors before connecting to the server.
    for accessor in ('get_task_queue', 'get_result_queue'):
        QueueManager.register(accessor)
    taskWorker()

 

爬虫调度器

from DataOutput import DataOutput
from HtmlDownloader import HtmlDownloader
from HtmlParser import HtmlParser
from UrlManager import UrlManager
from login import login
class SpliderMan(object):
    """Crawl scheduler: walks a book's chapter links and collects chapter data."""

    def __init__(self):
        self.manager = UrlManager()          # pending/visited URL bookkeeping
        self.downloader = HtmlDownloader()   # page fetcher
        self.parser = HtmlParser()           # extracts next URLs + chapter data
        self.output = DataOutput()           # accumulates chapters into self.datas
        self.datas = []                      # [book name, chapter dict, ...]

    def crawl(self, root_url, bookName):
        """Crawl from root_url; return [bookName, chapter, ...].

        Stops on the first page that raises, so a partial book may be
        returned after a failure.
        """
        self.manager.add_new_url(root_url)
        self.datas.append(bookName)
        while self.manager.has_new_urls():
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data, self.datas)
                print("已经抓取%s个链接" % self.manager.old_url_size())
            except Exception as e:
                # One bad page aborts the whole book; log and stop.
                print(e)
                print("crawl failed")
                break
        return self.datas

    def start(self, url, bookName, username='qingliu', pwd='long0000'):
        """Log in to the site, then crawl the book and return its data.

        Credentials were previously hard-coded; they are now keyword
        parameters whose defaults preserve the original behavior.
        """
        logIn = login(pwd, username)
        logIn.start()
        # Use a fresh instance (as the original did) so repeated start()
        # calls never accumulate state across books.
        splider_man = SpliderMan()
        return splider_man.crawl(url, bookName)

 

页面下载器

import requests
class HtmlDownloader(object):
    """Fetch a page and return its text decoded as GBK, or None on failure."""

    def download(self, url):
        """Download url; return page text, or None for a None url / non-200."""
        if url is None:
            return None
        agent = 'Mozilla/5.0 (windows NT 5.1;rv:33.0) Gecko/20100101 Firefox/33.0'
        # Bug fix: the header key must be 'User-Agent'; 'User_agent' is not
        # a valid HTTP header name, so the UA string was never sent.
        headers = {'User-Agent': agent}
        r = requests.get(url, headers=headers)

        if r.status_code == 200:
            r.encoding = 'gbk'  # the target site serves GBK-encoded pages
            return r.text
        return None

页面解析器(注:下方代码疑为页面下载器的重复粘贴,HtmlParser 的源码在原文中缺失)

import requests
class HtmlDownloader(object):
    """Fetch a page and return its text decoded as GBK, or None on failure.

    NOTE(review): this listing duplicates the downloader; the HtmlParser
    source the heading promises appears to be missing from the article.
    """

    def download(self, url):
        """Download url; return page text, or None for a None url / non-200."""
        if url is None:
            return None
        agent = 'Mozilla/5.0 (windows NT 5.1;rv:33.0) Gecko/20100101 Firefox/33.0'
        # Bug fix: the header key must be 'User-Agent'; 'User_agent' is not
        # a valid HTTP header name, so the UA string was never sent.
        headers = {'User-Agent': agent}
        r = requests.get(url, headers=headers)

        if r.status_code == 200:
            r.encoding = 'gbk'  # the target site serves GBK-encoded pages
            return r.text
        return None

数据存储器:与服务器端数据存储器不同,该部分只是将同一本小说的各个章节整合打包成List

import codecs

class DataOutput(object):
    """Client-side collector: appends parsed chapter data to a shared list."""

    def store_data(self, data, datas):
        """Append data to datas; None (a failed parse) is silently ignored."""
        if data is not None:
            datas.append(data)

 

URL管理器

class UrlManager(object):
    """Track pending and visited URLs with de-duplication across both sets."""

    def __init__(self):
        self.new_urls = set()  # waiting to be crawled
        self.old_urls = set()  # already crawled

    def has_new_urls(self):
        """True while any URL is still waiting to be crawled."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Pop an arbitrary pending URL and mark it as visited."""
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url

    def add_new_url(self, url):
        """Queue url unless it is None or already known (pending or visited)."""
        if url is not None and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in urls; a None iterable is treated as empty."""
        for url in urls or ():
            self.add_new_url(url)

    def old_url_size(self):
        """Number of URLs already crawled."""
        return len(self.old_urls)

 

网站登录器

import requests
class login(object):
    """POST credentials to the biquge login endpoint inside a session."""

    def __init__(self, pwd, username):
        ua = 'Mozilla/5.0 (windows NT 5.1;rv:33.0) Gecko/20100101 Firefox/33.0'
        self.agent = ua
        self.header = {'User-Agent': ua}
        self.post_url = 'http://www.biquge.com.tw/login.php?do=submit&action=login&usecookie=1&jumpurl='
        self.postdata = {'password': pwd, 'username': username}

    def start(self):
        """Perform the login POST and print the HTTP status code."""
        session = requests.session()
        response = session.post(self.post_url, data=self.postdata, headers=self.header)
        print(response.status_code)
if __name__ == '__main__':
    # Placeholder credentials — replace with real ones before running.
    pwd = 'xxxxx'
    username = 'xxxxxx'
    # Bug fix: the original did `login = login(pwd, username)`, rebinding the
    # class name to an instance; use a distinct variable name instead.
    client = login(pwd, username)
    client.start()

 

 


结果展示

服务器端:

 

 

 

客户端:

 

爬取结果

源码下载:https://download.csdn.net/download/qq_41484460/10578981 

转载自原文链接, 如需删除请联系管理员。

原文链接:小说下载器,转载请注明来源!

0