diff --git a/README.md b/README.md index 24ea2bc..08cf230 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,9 @@ ## Запуск -Для **Windows** доступны бинарные сборки в разделе [Releases](https://github.com/aliasn3t/booklead/releases) -Для запуска кода потребуется Python с модулями **img2pdf**, **requests** и **beautifulsoup4** +Для **Windows** доступны бинарные сборки в разделе [Releases](https://github.com/andrey-09/booklead_2.0/releases) +Для запуска кода потребуется Python с модулями **img2pdf**, **requests**, **beautifulsoup4**, **aiohttp**, **img2pdf**, **numpy**, **opencv_python** и **pillow**. + Установка модулей: `python3 -m pip install -r requirements.txt` ## Использование diff --git a/booklead.py b/booklead.py index 4c258af..fb63b24 100644 --- a/booklead.py +++ b/booklead.py @@ -4,14 +4,23 @@ import os import re import urllib.parse - +import asyncio +from aiohttp import ClientSession,TCPConnector, ClientTimeout +import datetime +import time +import numpy as np +#import nest_asyncio #used for debugging +from util import CV2_Russian, BinaryToDecimal,number_of_images, Postprocess, Time_Processing +import cv2 +import random import img2pdf from bs4 import BeautifulSoup - +import sys from util import get_logger from util import md5_hex, to_float, cut_bom, perror, progress, ptext, safe_file_name, Browser, select_one_text_optional -from util import select_one_text_required, select_one_attr_required, gwar_fix_json - +from util import select_one_text_required, select_one_attr_required, gwar_fix_json,mkdirs_for_regular_file +from util import user_agents +import logging log = get_logger(__name__) BOOK_DIR = 'books' @@ -22,7 +31,25 @@ } prlDl_params = { - 'ext': 'jpg' + 'ext': 'jpg' } + +headers_pr1 = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Language': 'en-US,en;q=0.9', + 'Cache-Control': 'max-age=0', + 'Connection': 'keep-alive', + 
async def _read_tile(session, url, headers):
    """Return the response body of ``url``, raising on an HTTP error status."""
    async with session.get(url, headers=headers) as response:
        # Without this check an error page would be queued as if it were
        # image data and only fail much later, during decoding.
        response.raise_for_status()
        return await response.read()


async def fetch_image(url: str, i, queue, headers_pr1, sem, session=None):
    """Download one image tile and put ``(i, body_bytes)`` on ``queue``.

    NOTE: the original author asked that this coroutine stay in this module
    instead of ``util.py``: asyncio did not work for them from there
    (possibly an environment-specific problem).

    Args:
        url: Fully formatted URL of the tile.
        i: Tile index, stored next to the payload so the caller can put the
            tile back in its place.
        queue: ``asyncio.Queue`` that receives the ``(i, bytes)`` result.
        headers_pr1: HTTP headers to send with this request.
        sem: ``asyncio.Semaphore`` limiting the number of concurrent requests.
        session: Optional shared ``aiohttp.ClientSession``. When omitted, a
            dedicated session is opened for this single request (the
            previous behaviour), so existing callers keep working.

    Raises:
        aiohttp.ClientResponseError: the server answered with status >= 400.
        aiohttp.ClientError, asyncio.TimeoutError: transport failure/timeout.
    """
    async with sem:
        if session is None:
            async with ClientSession(
                headers=headers_pr1,
                timeout=ClientTimeout(total=5),
                trust_env=True,
            ) as own_session:
                payload = await _read_tile(own_session, url, None)
        else:
            payload = await _read_tile(session, url, headers_pr1)
    await queue.put((i, payload))


async def async_images(url, num, headers_pr1):
    """Download ``num`` tiles concurrently and collect them as ``(index, bytes)``.

    NOTE: the original author asked that this coroutine stay in this module
    instead of ``util.py`` (asyncio did not work for them from there).

    Args:
        url: URL template with one ``{}`` placeholder for the tile index.
        num: Number of tiles to request (indices ``0 .. num - 1``).
        headers_pr1: Base HTTP headers. The mapping is not modified; each
            request gets its own copy with a randomly chosen User-Agent.

    Returns:
        The list of ``(index, bytes)`` tuples, in completion order. For
        backward compatibility the same list is also stored in the
        module-level ``results_prlDl``.

    Raises:
        ExceptionGroup: raised by ``asyncio.TaskGroup`` if any download
            fails; in that case ``results_prlDl`` is left untouched.
    """
    global results_prlDl

    # Concurrency cap, see
    # https://stackoverflow.com/questions/63347818/aiohttp-client-exceptions-clientconnectorerror-cannot-connect-to-host-stackover
    sem = asyncio.Semaphore(100)
    queue = asyncio.Queue()

    # One session for all tiles so that connections are pooled and reused
    # instead of opening a fresh session (and connection) per tile.
    async with ClientSession(timeout=ClientTimeout(total=5), trust_env=True) as session:
        async with asyncio.TaskGroup() as group:
            for i in range(num):
                # Tasks only start running after this loop yields to the event
                # loop, so mutating one shared dict would make every request
                # use the last User-Agent picked. Give each task its own copy.
                tile_headers = {**headers_pr1, 'User-Agent': random.choice(user_agents)}
                group.create_task(
                    fetch_image(url.format(i), i, queue, tile_headers, sem, session=session)
                )

    # All tasks have finished here, so the queue can be drained synchronously.
    tiles = []
    while not queue.empty():
        tiles.append(queue.get_nowait())

    results_prlDl = tiles
    return tiles
book['imageDir'], page['f'], page['d'][len(page['d']) - 1]['w']) - saveImage(img_url, idx + 1, title, ext, url) - progress(f' ─ Прогресс: {idx + 1} из {len(pages)} стр.') + + img_url = 'https://content.prlib.ru/fcgi-bin/iipsrv.fcgi?FIF={}/{}&JTL={},'.format( + book['imageDir'], page['f'], page['m']) #поменял здесь немного вид урл, так как по частям качаю + # брал урл отсюда: https://iipimage.sourceforge.io/documentation/protocol + img_url+="{}" + width, height=number_of_images(page["d"][len(page['d']) - 1]['w'],page["d"][len(page['d']) - 1]['h']) + + image_short = '%05d.%s' % (idx+1, ext) + image_path = os.path.join(BOOK_DIR, title, image_short) + + # заменяю все фичи ручками (например тут skip_if_exists), которые были ранее доступны через функции + #(т.к. метод у меня скачивания немного другой) + if os.path.exists(image_path) and os.stat(image_path).st_size > 0: + log.info(f'Пропускаю скачанный файл: {image_path}') + progress(f' Прогресс: {idx + 1} из {len(pages)} стр. ') + else: + mkdirs_for_regular_file(image_path) + #nest_asyncio.apply() # нужен только чтобы async работал нормально в Jupyter ( https://pypi.org/project/nest-asyncio/) + # получить все данные с картиники: + global headers_pr1 + headers_pr1.update({'Referer': url}) + + flag=True #для проверки на хороший requests + global results_prlDl + while flag: #just keep quering the connection + try: + + asyncio.run(async_images(img_url,width*height,headers_pr1)) #Downgrade to 3.6.2 #Using Python 3.8 https://blog.csdn.net/y662225dd/article/details/135273140 + #loop = asyncio.get_event_loop() #for old version of aiohttp: 3.6.2 + #loop.run_until_complete(async_images(img_url,width*height,headers)) + except Exception as Argument: #Error coding + time.sleep(1.0) + + logging.exception("Error occurred in ASYNCIO") + else: + if len(results_prlDl)!=0: + flag=False + + # просессить все данные и в конце вывести картинку + Postprocess(results_prlDl,width,height, image_path) + + # Time Formatting/Prediction: + 
prog=datetime.datetime.now()-start + num_of_pages_down+=1 + left=prog/num_of_pages_down*(len(pages)-(idx+1)) #based on the values before prediction + minutes, seconds = Time_Processing(left) + past_min, past_sec=Time_Processing(prog) + progress(f' Прогресс: {idx + 1} из {len(pages)} стр. | Прошло (мин:сек): {past_min}:{past_sec:02d} ;Осталось: {minutes}:{seconds:02d} ') return title, ext @@ -192,7 +300,7 @@ def gwarDL(url): book_dir = ('{}_{}'.format(book_id, title))[0:224] - ptext(f' ─ Каталог для загрузки: {book_dir}') + ptext(f' Каталог для загрузки: {book_dir}') request_headers = {'referer': url} json_text = bro.post_text(json_url, request_headers, request_data) @@ -212,7 +320,7 @@ def gwarDL(url): img_url = 'https://cdn.gwar.mil.ru/imagesfww/{}'.format( # либо ...ru/imageloadfull/ image_url) saveImage(img_url, idx + 1, book_dir, ext, 'https://gwar.mil.ru/') - progress(f' ─ Прогресс: {idx + 1} из {len(pages)} стр.') + progress(f' Прогресс: {idx + 1} из {len(pages)} стр.') return title, ext @@ -271,7 +379,7 @@ def main(): for url in urls: load = download_book(url) if load and args.pdf.lower() in ['y', 'yes']: - progress(' ─ Создание PDF...') + progress(' Создание PDF...') title, img_ext = load img_folder_full = os.path.join(BOOK_DIR, title) pdf_path = os.path.join(BOOK_DIR, f'{title}.pdf') diff --git a/requirements.txt b/requirements.txt index 9f25d26..53e8015 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,7 @@ -beautifulsoup4==4.9.0 -img2pdf==0.3.6 -requests==2.25.1 +aiohttp==3.11.18 +beautifulsoup4==4.13.4 +img2pdf==0.6.1 +numpy==2.2.5 +opencv_python==4.11.0.86 +Requests==2.32.3 +pillow==11.2.1 \ No newline at end of file diff --git a/util.py b/util.py index 8f08a5e..44fe5f5 100644 --- a/util.py +++ b/util.py @@ -5,19 +5,22 @@ import functools import hashlib import logging +import numpy as np import os import random import re import shutil import sys import time - +import datetime +import cv2 import requests from bs4 import Tag from requests 
def Time_Processing(timedelta):
    """Split a duration into whole minutes and remaining seconds for display.

    Args:
        timedelta: A ``datetime.timedelta`` (anything with ``total_seconds()``).

    Returns:
        ``(minutes, seconds)`` after rounding the duration to whole seconds.
    """
    return divmod(round(timedelta.total_seconds()), 60)


def Postprocess(results_prlDl, width, height, image_path):
    """Assemble downloaded tiles into one image and write it as JPEG.

    Args:
        results_prlDl: Iterable of ``(tile_index, jpeg_bytes)``. Tiles are
            placed row by row: index ``r * width + c`` is row ``r``, column ``c``.
        width: Number of tiles per row.
        height: Number of tile rows.
        image_path: Destination file, e.g. ``books/<title>/00001.jpg``.

    Raises:
        ValueError: the grid is empty, a tile index is out of range, a tile
            is missing or cannot be decoded, or JPEG encoding fails.
    """
    expected = width * height
    if expected <= 0:
        raise ValueError(f'Empty tile grid: {width}x{height}')

    # None marks a slot that has not been filled, so gaps are detected
    # explicitly instead of surfacing as an obscure cv2 error later.
    tiles = [None] * expected
    target_dir = os.path.dirname(image_path)
    for index, payload in results_prlDl:
        if not 0 <= index < expected:
            raise ValueError(f'Tile index {index} is outside the {width}x{height} grid')
        tiles[index] = BinaryToDecimal(payload, target_dir)

    missing = [index for index, tile in enumerate(tiles) if tile is None]
    if missing:
        raise ValueError(f'Missing or undecodable tiles: {missing}')

    rows = [cv2.hconcat(tiles[row * width:(row + 1) * width]) for row in range(height)]
    full_image = cv2.vconcat(rows)

    # cv2.imwrite cannot handle non-ASCII (e.g. Russian) paths, so encode in
    # memory and write the bytes with Python's own file API.
    ok, encoded = cv2.imencode('.jpg', full_image)
    if not ok:
        raise ValueError(f'Could not encode image for {image_path}')
    with open(image_path, 'wb') as fh:
        fh.write(encoded.tobytes())


def number_of_images(width, height, tile_size=256):
    """Return how many tiles cover an image horizontally and vertically.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        tile_size: Edge length of one square tile in pixels. The default of
            256 is the value this function previously hard-coded.

    Returns:
        ``(tiles_per_row, tiles_per_column)`` as ints, rounded up so that
        partial tiles at the right/bottom edges are counted.
    """
    def tiles_along(length):
        # Ceiling division without floats: -(-a // b).
        return int(-(-length // tile_size))

    return tiles_along(width), tiles_along(height)


def BinaryToDecimal(binary, image_path):
    """Decode encoded image bytes (e.g. JPEG) into a cv2/numpy image.

    The bytes are decoded in memory; no temporary file is written.

    Args:
        binary: Encoded image data.
        image_path: Unused. Kept so existing callers, which pass the target
            directory here, continue to work.

    Returns:
        The decoded BGR image array, or ``None`` if ``binary`` is empty or
        cannot be decoded.
    """
    if not binary:
        return None
    buffer = np.frombuffer(binary, dtype=np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)


def CV2_Russian(name):
    """Read an image whose path may contain non-ASCII (e.g. Russian) characters.

    ``cv2.imread`` fails on such paths, so the file is read with Python and
    decoded from memory instead, see
    https://answers.opencv.org/question/205345/imread-and-russian-language-path-to-img/

    Args:
        name: Path of the image file.

    Returns:
        The decoded BGR image array, or ``None`` if the data is not a
        decodable image.
    """
    with open(name, "rb") as f:
        chunk = f.read()
    chunk_arr = np.frombuffer(chunk, dtype=np.uint8)
    return cv2.imdecode(chunk_arr, cv2.IMREAD_COLOR)