首页 » 技术分享 » 塔多漫画Python爬虫—妈妈再也不用担心我看漫画了

塔多漫画Python爬虫—妈妈再也不用担心我看漫画了

 

塔多漫画使用了JavaScript加密,无法直接找出漫画图片的真实地址。通过观察网页源代码发现,真实的图片地址加密后存放在一个名为cp的变量中,好在页面自身已经给出了解密算法。

// Decoder shipped by the site itself, reproduced verbatim: a standard
// base64 decode whose output (a JS expression) is then eval()'d —
// out.slice(4) drops a 4-character prefix before evaluation.
// NOTE(review): eval on page-supplied data is unsafe in general.
function base64decode(str)
    {var base64EncodeChars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    // Reverse lookup table; -1 marks bytes outside the base64 alphabet.
    var base64DecodeChars=new Array(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1);
    var c1,c2,c3,c4;
    var i,len,out;
    len=str.length;i=0;
    out="";
    outq="";
    // Standard 4-chars-to-3-bytes base64 loop; a '=' padding byte (61)
    // triggers the early eval-and-return path below.
    while(i<len){do{c1=base64DecodeChars[str.charCodeAt(i++)&255]}
    while(i<len&&c1==-1);if(c1==-1){break}do{c2=base64DecodeChars[str.charCodeAt(i++)&255]}
    while(i<len&&c2==-1);if(c2==-1){break}out+=String.fromCharCode((c1<<2)|((c2&48)>>4));do{c3=str.charCodeAt(i++)&255;if(c3==61){outq=eval(out.slice(4));return outq}c3=base64DecodeChars[c3]}
    while(i<len&&c3==-1);if(c3==-1){break}out+=String.fromCharCode(((c2&15)<<4)|((c3&60)>>2));do{c4=str.charCodeAt(i++)&255;if(c4==61){outq=eval(out.slice(4));return outq}c4=base64DecodeChars[c4]}
    while(i<len&&c4==-1);if(c4==-1){break}out+=String.fromCharCode(((c3&3)<<6)|c4)}outq=eval(out.slice(4));return outq}; 

这里使用的是base64的加密算法。不想再重新用Python来构建解密函数了,所以这里使用了js2py这个库来执行解密。

pip install js2py

程序有点混乱,给有用的同学吧。塔多漫画有一定的反爬虫机制,因此特意构建了请求的header;如果不能使用了,请自行更换header。

源码:

import js2py
import re
from urllib.request import urlretrieve
import urllib.request

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"}  

def downpic(url, ji):
    """Download every page image of one comic chapter.

    Fetches the chapter page at *url*, extracts the encrypted ``cp``
    variable from the HTML, decodes it with the site's own base64/eval
    routine (executed through js2py), then downloads each image found.

    Args:
        url: chapter page URL on m.taduo.net.
        ji: chapter number, used as the local filename prefix.
    """
    # Close the response promptly instead of leaking the socket.
    with urllib.request.urlopen(url) as resp:
        page = resp.read()
    # The encrypted image list is embedded in the HTML as: cp="<base64>".
    found = re.search(rb'cp="(.*?)"', page)
    if found is None:
        # Original code crashed with TypeError here when the page layout
        # changed; skip the chapter instead.
        print(str(ji) + ': cp variable not found, skipping')
        return
    cp = found.group(1).decode('utf-8')
    # Run the site's own decoder rather than porting it to Python.
    urljs = js2py.eval_js('''
    function base64decode(str)
    {var base64EncodeChars="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    var base64DecodeChars=new Array(-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,62,-1,-1,-1,63,52,53,54,55,56,57,58,59,60,61,-1,-1,-1,-1,-1,-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,-1,-1,-1,-1,-1,-1,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,-1,-1,-1,-1,-1);
    var c1,c2,c3,c4;
    var i,len,out;
    len=str.length;i=0;
    out="";
    outq="";
    while(i<len){do{c1=base64DecodeChars[str.charCodeAt(i++)&255]}
    while(i<len&&c1==-1);if(c1==-1){break}do{c2=base64DecodeChars[str.charCodeAt(i++)&255]}
    while(i<len&&c2==-1);if(c2==-1){break}out+=String.fromCharCode((c1<<2)|((c2&48)>>4));do{c3=str.charCodeAt(i++)&255;if(c3==61){outq=eval(out.slice(4));return outq}c3=base64DecodeChars[c3]}
    while(i<len&&c3==-1);if(c3==-1){break}out+=String.fromCharCode(((c2&15)<<4)|((c3&60)>>2));do{c4=str.charCodeAt(i++)&255;if(c4==61){outq=eval(out.slice(4));return outq}c4=base64DecodeChars[c4]}
    while(i<len&&c4==-1);if(c4==-1){break}out+=String.fromCharCode(((c3&3)<<6)|c4)}outq=eval(out.slice(4));return outq}; 
    ''')
    cp_data = urljs(cp)
    # Relative image paths look like "2018/xx/.../123.jpg"; raw string
    # avoids the invalid \d / \/ escape warnings of the original pattern.
    pics = re.findall(r'20\d\d/.*?jpg', cp_data)
    print(pics)
    for idx, rel in enumerate(pics):
        img_url = "http://fn.taduo.net/" + rel
        path = str(ji) + "_" + str(idx) + ".jpg"
        downpict(img_url, path)
        print(str(ji) + '正在下载' + str(idx))


def down_pic(url, path):
    """Download *url* to local file *path* using the module-level headers.

    Bug fix: the original called ``request.Request`` / ``request.urlopen``,
    which is a NameError under ``import urllib.request`` (the file never
    does ``from urllib import request``). Errors are reported, not raised,
    so a single failed image does not abort the chapter.
    """
    try:
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req).read()
        # 'with' already closes the file; the explicit close was redundant.
        with open(path, 'wb') as f:
            f.write(data)
    except Exception as e:
        # Best-effort download: log and continue, as in the original.
        print(str(e))


def downpict(url, Path):
    """Download *url* to file *Path*, sending a mobile User-Agent.

    Bug fix: the original called ``urllib.request.install_opener`` on
    every invocation, mutating global urllib state for the whole process.
    Using the private opener directly has the same effect (custom UA on
    the request) without the global side effect.
    """
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Linux; U; Android 8.1.0; en-us; MI 8 Build/OPM1.171019.011) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/61.0.3163.128 Mobile Safari/537.36 XiaoMi/MiuiBrowser/10.1.1')]
    with opener.open(url) as resp, open(Path, 'wb') as f:
        f.write(resp.read())

def _crawl_chapters(start=39, end=100):
    """Fetch the chapter index of comic #24 and download chapters start..end.

    Args:
        start: first chapter number to download (inclusive).
        end: last chapter number to download (inclusive).
    """
    index_url = "http://www.taduo.net/manhua/24/"
    req = urllib.request.Request(index_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Linux; U; Android 8.1.0; en-us; MI 8 Build/OPM1.171019.011) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/61.0.3163.128 Mobile Safari/537.36 XiaoMi/MiuiBrowser/10.1.1')
    with urllib.request.urlopen(req) as resp:
        listing = resp.read()
    for ii in range(start, end + 1):
        # Chapter links look like: <li><a href="/manhua/24/12345.html" title="39…
        pattern = (r'<li><a href="(/manhua/24/\d{2,6}\.html)" title="' + str(ii)).encode('utf8')
        match = re.search(pattern, listing)
        if match is None:
            # The original indexed a None result and crashed (TypeError)
            # when a chapter number was absent from the index page.
            print(str(ii) + ': chapter link not found, skipped')
            continue
        chapter_url = "http://m.taduo.net" + match.group(1).decode('utf-8')
        print(ii)
        print(chapter_url)
        downpic(chapter_url, ii)


if __name__ == "__main__":
    # Original hard-coded range: stn=39, end=100 (series has ~412 chapters).
    _crawl_chapters(39, 100)

转载自原文链接, 如需删除请联系管理员。

原文链接:塔多漫画Python爬虫—妈妈再也不用担心我看漫画了,转载请注明来源!

0