Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
c709c44
Update README.md
andrey-09 May 8, 2025
5d6b100
Update README.md
andrey-09 May 8, 2025
3c9ab0e
Update README.md
andrey-09 May 8, 2025
42344ea
Update README.md
andrey-09 May 8, 2025
d9ba9a0
Bookleads 2.0. Support for prlib.ru. 1-2 seconds/image.
andrey-09 May 8, 2025
b7b9f55
Delete README.md
andrey-09 May 8, 2025
dcbdc62
Create README.md
andrey-09 May 8, 2025
2e6b839
Update README.md
andrey-09 May 8, 2025
70a42a1
Update util.py
andrey-09 May 8, 2025
2a8481b
Update booklead.py
andrey-09 May 8, 2025
2e96e8d
Update booklead.py
andrey-09 May 8, 2025
645d3cc
Update booklead.py
andrey-09 May 8, 2025
db7d532
Update booklead.py
andrey-09 May 8, 2025
45bc698
Update requirements.txt
andrey-09 May 8, 2025
db7056f
Update util.py
andrey-09 May 9, 2025
4620762
Update booklead.py
andrey-09 May 9, 2025
60f30e6
Update util.py
andrey-09 May 9, 2025
28ef27b
Добавил секунды форматированные и обновил пакеты для Python 3.11.11
andrey-09 May 10, 2025
5506d70
Exceptions of asyncio
andrey-09 May 10, 2025
662f2e2
Update README.md
andrey-09 May 10, 2025
03b020c
Update README.md
andrey-09 May 10, 2025
ffc21a0
Update README.md
andrey-09 May 10, 2025
d1ab857
Update README.md
andrey-09 May 10, 2025
8119329
Update README.md
andrey-09 May 10, 2025
11df598
Update README.md
andrey-09 May 11, 2025
0f2c677
Фиксил проблему с SSL connection. Отладил нетвокринг. Добавил огранич…
andrey-09 May 11, 2025
227c1cb
Merge pull request #1 from andrey-09/Local
andrey-09 May 11, 2025
827713f
Delete README.md
andrey-09 May 11, 2025
1c310a4
Create README.md
andrey-09 May 11, 2025
526aca0
Merge pull request #2 from andrey-09/To-merge
andrey-09 May 11, 2025
6690182
Update README.md
andrey-09 May 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@

## Запуск

Для **Windows** доступны бинарные сборки в разделе [Releases](https://github.com/aliasn3t/booklead/releases)
Для запуска кода потребуется Python с модулями **img2pdf**, **requests** и **beautifulsoup4**
Для **Windows** доступны бинарные сборки в разделе [Releases](https://github.com/andrey-09/booklead_2.0/releases)
Для запуска кода потребуется Python с модулями **img2pdf**, **requests**, **beautifulsoup4**, **aiohttp**, **numpy**, **opencv_python** и **pillow**.

Установка модулей: `python3 -m pip install -r requirements.txt`

## Использование
Expand Down
146 changes: 127 additions & 19 deletions booklead.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,23 @@
import os
import re
import urllib.parse

import asyncio
from aiohttp import ClientSession,TCPConnector, ClientTimeout
import datetime
import time
import numpy as np
#import nest_asyncio #used for debugging
from util import CV2_Russian, BinaryToDecimal,number_of_images, Postprocess, Time_Processing
import cv2
import random
import img2pdf
from bs4 import BeautifulSoup

import sys
from util import get_logger
from util import md5_hex, to_float, cut_bom, perror, progress, ptext, safe_file_name, Browser, select_one_text_optional
from util import select_one_text_required, select_one_attr_required, gwar_fix_json

from util import select_one_text_required, select_one_attr_required, gwar_fix_json,mkdirs_for_regular_file
from util import user_agents
import logging
log = get_logger(__name__)

BOOK_DIR = 'books'
Expand All @@ -22,7 +31,25 @@
}

prlDl_params = {
'ext': 'jpg'
'ext': 'jpg' }

# Browser-like request headers used for the prlib.ru tile downloads.
# The dict is adjusted at runtime: prlDl() adds a 'Referer' entry and the
# 'User-Agent' value is replaced for the individual tile requests.
#
# NOTE: the conditional 'If-Modified-Since' header that used to be listed here
# was removed on purpose. This client keeps no cache, and a conditional request
# allows the server to reply "304 Not Modified" with an empty body, which
# cannot be decoded as an image tile.
headers_pr1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
    'dnt': '1',
    'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-gpc': '1',
}

bro: Browser
Expand All @@ -47,28 +74,59 @@ def saveImage(url, img_id, folder, ext, referer):
expected_ct = re.compile('image/')
bro.download(url, image_path, headers, content_type=expected_ct, skip_if_file_exists=True)


async def fetch_image(url: str, i, queue, headers_pr1, sem):
    """Download one image tile and put ``(i, body_bytes)`` onto ``queue``.

    Keep this coroutine in this module: the original author reported that
    asyncio stopped working when it was moved to util.py (possibly a local
    issue on their side).

    Args:
        url: Fully formatted URL of the tile to fetch.
        i: Index of the tile; stored next to the payload so the caller can
            put the tiles back in order.
        queue: ``asyncio.Queue`` that receives the ``(i, bytes)`` result.
        headers_pr1: Default HTTP headers for the client session.
        sem: ``asyncio.Semaphore`` limiting how many downloads run at once.

    Raises:
        aiohttp.ClientResponseError: The server answered with a 4xx/5xx status.
        RuntimeError: The response carried no body.
    """
    async with sem:
        # trust_env=True lets aiohttp pick up proxy settings from the environment.
        async with ClientSession(
            headers=headers_pr1,
            timeout=ClientTimeout(total=5),
            trust_env=True,
        ) as session:
            async with session.get(url) as response:
                # Fail loudly instead of queueing an error page as tile data;
                # the caller retries the whole page on any exception.
                response.raise_for_status()
                payload = await response.read()
                if not payload:
                    raise RuntimeError(
                        f'Empty response body for tile {i} (HTTP {response.status}): {url}'
                    )
                await queue.put((i, payload))

async def async_images(url, num, headers_pr1):
    """Download all ``num`` tiles of one page concurrently.

    Keep this coroutine in this module: the original author reported that
    asyncio stopped working when it was moved to util.py (possibly a local
    issue on their side).

    Args:
        url: URL template with one ``{}`` placeholder for the tile index.
        num: Number of tiles to request (indices ``0 .. num - 1``).
        headers_pr1: Base HTTP headers. The mapping is not modified; every
            request gets its own copy with a randomly chosen User-Agent.

    Returns:
        The list of ``(tile_index, bytes)`` pairs, in completion order. The
        same list is also stored in the module-level ``results_prlDl`` for
        callers that read the global.

    Raises:
        ExceptionGroup: Raised by ``asyncio.TaskGroup`` if any download fails.
    """
    global results_prlDl

    # Cap the number of simultaneous requests to avoid connection errors:
    # https://stackoverflow.com/questions/63347818/aiohttp-client-exceptions-clientconnectorerror-cannot-connect-to-host-stackover
    sem = asyncio.Semaphore(100)
    queue = asyncio.Queue()
    async with asyncio.TaskGroup() as group:
        for i in range(num):
            # Build a fresh dict per task: the tasks only start running after
            # this loop, so mutating one shared dict would leave every request
            # with the User-Agent chosen on the last iteration.
            task_headers = {**headers_pr1, 'User-Agent': random.choice(user_agents)}
            group.create_task(fetch_image(url.format(i), i, queue, task_headers, sem))

    # All tasks have finished once the TaskGroup block exits, so the queue
    # can be drained without waiting.
    results_prlDl = []
    while not queue.empty():
        results_prlDl.append(queue.get_nowait())
    return results_prlDl

def eshplDl(url):
ext = eshplDl_params['ext']
quality = eshplDl_params['quality']
domain = urllib.parse.urlsplit(url).netloc

html_text = bro.get_text(url)

soup = BeautifulSoup(html_text, 'html.parser')
title = select_one_text_optional(soup, 'title') or md5_hex(url)
title = safe_file_name(title)

for script in soup.findAll('script'):


st = str(script)

if 'initDocview' in st:
book_json = json.loads(st[st.find('{"'): st.find(')')])
ptext(f' Каталог для загрузки: {title}')
ptext(f' Каталог для загрузки: {title}')
pages = book_json['pages']
for idx, page in enumerate(pages):
img_url = f'http://{domain}/pages/{page["id"]}/zooms/{quality}'
saveImage(img_url, idx + 1, title, ext, url)
progress(f' Прогресс: {idx + 1} из {len(pages)} стр.')
progress(f' Прогресс: {idx + 1} из {len(pages)} стр.')
return title, ext


def prlDl(url):
"""
Expand All @@ -81,20 +139,70 @@ def prlDl(url):
soup = BeautifulSoup(html_text, 'html.parser')
title = select_one_text_optional(soup, 'h1') or md5_hex(url)
title = safe_file_name(title)
ptext(f' ─ Каталог для загрузки: {title}')
for script in soup.findAll('script'):
ptext(f'Каталог для загрузки: {title}')

for script in soup.find_all('script'): #findAll deprecated
st = str(script)
if 'jQuery.extend' in st:
book_json = json.loads(st[st.find('{"'): st.find(');')])
book = book_json['diva']['1']['options']
if "item" in url.split("prlib.ru/")[1]: #case for https://www.prlib.ru/item/***
book = book_json['diva']['1']['options']
elif "node" in url.split("prlib.ru/")[1]: #case for https://www.prlib.ru/node/***
book = book_json['diva']['settings']
json_text = bro.get_text(book['objectData'])
book_data = json.loads(json_text)
pages = book_data['pgs']
num_of_pages_down=0 #for the time prediction
start=datetime.datetime.now()#for the time prediction
for idx, page in enumerate(pages):
img_url = 'https://content.prlib.ru/fcgi-bin/iipsrv.fcgi?FIF={}/{}&WID={}&CVT=jpeg'.format(
book['imageDir'], page['f'], page['d'][len(page['d']) - 1]['w'])
saveImage(img_url, idx + 1, title, ext, url)
progress(f' ─ Прогресс: {idx + 1} из {len(pages)} стр.')

img_url = 'https://content.prlib.ru/fcgi-bin/iipsrv.fcgi?FIF={}/{}&JTL={},'.format(
book['imageDir'], page['f'], page['m']) #поменял здесь немного вид урл, так как по частям качаю
# брал урл отсюда: https://iipimage.sourceforge.io/documentation/protocol
img_url+="{}"
width, height=number_of_images(page["d"][len(page['d']) - 1]['w'],page["d"][len(page['d']) - 1]['h'])

image_short = '%05d.%s' % (idx+1, ext)
image_path = os.path.join(BOOK_DIR, title, image_short)

# заменяю все фичи ручками (например тут skip_if_exists), которые были ранее доступны через функции
#(т.к. метод у меня скачивания немного другой)
if os.path.exists(image_path) and os.stat(image_path).st_size > 0:
log.info(f'Пропускаю скачанный файл: {image_path}')
progress(f' Прогресс: {idx + 1} из {len(pages)} стр. ')
else:
mkdirs_for_regular_file(image_path)
#nest_asyncio.apply() # нужен только чтобы async работал нормально в Jupyter ( https://pypi.org/project/nest-asyncio/)
# получить все данные с картиники:
global headers_pr1
headers_pr1.update({'Referer': url})

flag=True #для проверки на хороший requests
global results_prlDl
while flag: #just keep quering the connection
try:

asyncio.run(async_images(img_url,width*height,headers_pr1)) #Downgrade to 3.6.2 #Using Python 3.8 https://blog.csdn.net/y662225dd/article/details/135273140
#loop = asyncio.get_event_loop() #for old version of aiohttp: 3.6.2
#loop.run_until_complete(async_images(img_url,width*height,headers))
except Exception as Argument: #Error coding
time.sleep(1.0)

logging.exception("Error occurred in ASYNCIO")
else:
if len(results_prlDl)!=0:
flag=False

# просессить все данные и в конце вывести картинку
Postprocess(results_prlDl,width,height, image_path)

# Time Formatting/Prediction:
prog=datetime.datetime.now()-start
num_of_pages_down+=1
left=prog/num_of_pages_down*(len(pages)-(idx+1)) #based on the values before prediction
minutes, seconds = Time_Processing(left)
past_min, past_sec=Time_Processing(prog)
progress(f' Прогресс: {idx + 1} из {len(pages)} стр. | Прошло (мин:сек): {past_min}:{past_sec:02d} ;Осталось: {minutes}:{seconds:02d} ')
return title, ext


Expand Down Expand Up @@ -192,7 +300,7 @@ def gwarDL(url):

book_dir = ('{}_{}'.format(book_id, title))[0:224]

ptext(f' Каталог для загрузки: {book_dir}')
ptext(f' Каталог для загрузки: {book_dir}')
request_headers = {'referer': url}

json_text = bro.post_text(json_url, request_headers, request_data)
Expand All @@ -212,7 +320,7 @@ def gwarDL(url):
img_url = 'https://cdn.gwar.mil.ru/imagesfww/{}'.format( # либо ...ru/imageloadfull/
image_url)
saveImage(img_url, idx + 1, book_dir, ext, 'https://gwar.mil.ru/')
progress(f' Прогресс: {idx + 1} из {len(pages)} стр.')
progress(f' Прогресс: {idx + 1} из {len(pages)} стр.')
return title, ext


Expand Down Expand Up @@ -271,7 +379,7 @@ def main():
for url in urls:
load = download_book(url)
if load and args.pdf.lower() in ['y', 'yes']:
progress(' Создание PDF...')
progress(' Создание PDF...')
title, img_ext = load
img_folder_full = os.path.join(BOOK_DIR, title)
pdf_path = os.path.join(BOOK_DIR, f'{title}.pdf')
Expand Down
10 changes: 7 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
beautifulsoup4==4.9.0
img2pdf==0.3.6
requests==2.25.1
aiohttp==3.11.18
beautifulsoup4==4.13.4
img2pdf==0.6.1
numpy==2.2.5
opencv_python==4.11.0.86
Requests==2.32.3
pillow==11.2.1
67 changes: 63 additions & 4 deletions util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,22 @@
import functools
import hashlib
import logging
import numpy as np
import os
import random
import re
import shutil
import sys
import time

import datetime
import cv2
import requests
from bs4 import Tag
from requests import Response
from typing import Dict, Optional, Pattern, Union
import json


user_agents = [
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
Expand All @@ -27,6 +30,7 @@
'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0'
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36'
]


Expand Down Expand Up @@ -89,8 +93,63 @@ def mkdirs_for_regular_file(filename: str):
except OSError as e: # Guard against race condition
if e.errno != errno.EEXIST:
raise


def Time_Processing(timedelta):
    """Split a duration into whole minutes and remaining seconds for display.

    Args:
        timedelta: Object exposing ``total_seconds()`` (e.g. ``datetime.timedelta``).

    Returns:
        ``(minutes, seconds)`` computed from the duration rounded to the
        nearest whole second.
    """
    whole_seconds = round(timedelta.total_seconds())
    return whole_seconds // 60, whole_seconds % 60

def Postprocess(results_prlDl, width, height, image_path):
    """Stitch downloaded tiles into one JPEG image and write it to disk.

    Args:
        results_prlDl: Iterable of ``(tile_index, image_bytes)`` pairs. Tile
            indices are row-major: index ``r * width + c`` is the tile in
            row ``r`` and column ``c``.
        width: Number of tile columns.
        height: Number of tile rows.
        image_path: Destination file, for example ``.../00001.jpg``.

    Raises:
        ValueError: The grid is empty, a tile index is out of range, a tile is
            missing or cannot be decoded, or the result cannot be encoded.
    """
    expected = width * height
    if expected <= 0:
        raise ValueError(f'Nothing to assemble for a {width}x{height} tile grid')

    tiles = [None] * expected
    for index, payload in results_prlDl:
        if not 0 <= index < expected:
            raise ValueError(f'Tile index {index} is outside the {width}x{height} grid')
        # Decode straight from memory: no scratch file is needed, and paths
        # with non-ASCII characters are never handed to OpenCV.
        if payload:
            tiles[index] = cv2.imdecode(np.frombuffer(payload, dtype=np.uint8), cv2.IMREAD_COLOR)

    missing = [index for index, tile in enumerate(tiles) if tile is None]
    if missing:
        raise ValueError(f'Missing or undecodable tiles: {missing}')

    # Join the tiles of every row left to right, then stack the rows.
    rows = [cv2.hconcat(tiles[row * width:(row + 1) * width]) for row in range(height)]
    page = cv2.vconcat(rows)

    # cv2.imwrite does not handle non-ASCII paths, so encode in memory and
    # write the bytes with Python's own file API.
    ok, encoded = cv2.imencode('.jpg', page)
    if not ok:
        raise ValueError(f'Could not encode the assembled image for {image_path}')
    with open(image_path, 'wb') as fh:
        fh.write(encoded.tobytes())
def number_of_images(width, height, tile_size=256):
    """Return how many tiles are needed to cover an image.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        tile_size: Edge length of one square tile in pixels (default 256).

    Returns:
        ``(columns, rows)`` as integers; a partially filled last column or
        row counts as a full tile.

    Raises:
        ValueError: ``tile_size`` is not positive.
    """
    if tile_size <= 0:
        raise ValueError(f'tile_size must be positive, got {tile_size}')
    # Ceiling division without going through floating point.
    columns = -(-width // tile_size)
    rows = -(-height // tile_size)
    return int(columns), int(rows)

def BinaryToDecimal(binary, image_path):
    """Turn raw image bytes into a decoded image by way of a scratch file.

    The bytes are written to ``test.jpg`` inside ``image_path`` (which is used
    as a directory) and read back through ``CV2_Russian``. The scratch file is
    still on disk when this function returns.

    Args:
        binary: Encoded image bytes.
        image_path: Directory in which the scratch file is created.

    Returns:
        Whatever ``CV2_Russian`` returns for the scratch file.
    """
    scratch_file = os.path.join(image_path, "test.jpg")
    with open(scratch_file, "wb") as handle:
        handle.write(binary)
    # Read through CV2_Russian rather than letting OpenCV open the file itself:
    # a folder name containing Russian characters prevented cv2 from reading
    # it (a known cv2 issue).
    return CV2_Russian(scratch_file)
def CV2_Russian(name):
    """Read an image whose path may contain Russian characters.

    The file is read with Python's own file API and decoded from memory, so
    the path is never passed to OpenCV. Background:
    https://answers.opencv.org/question/205345/imread-and-russian-language-path-to-img/

    Args:
        name: Path of the image file.

    Returns:
        The result of ``cv2.imdecode`` with ``cv2.IMREAD_COLOR`` for the file
        contents.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(name, "rb") as f:
        chunk = f.read()
    chunk_arr = np.frombuffer(chunk, dtype=np.uint8)
    return cv2.imdecode(chunk_arr, cv2.IMREAD_COLOR)


def cut_bom(s: str):
    """Return ``s`` without a leading UTF-8 byte-order mark, if it has one."""
    marker = codecs.BOM_UTF8.decode("utf-8")
    if s.startswith(marker):
        return s[len(marker):]
    return s
Expand All @@ -115,7 +174,7 @@ def gwar_fix_json(s: str, a: bool = False) -> str:
s = s.replace("'", '"')
if a:
# https://stackoverflow.com/questions/50947760/how-to-fix-json-key-values-without-double-quotes
s = re.sub("(\w+):", r'"\1":', s)
s = re.sub(r"(\w+):", r'"\1":', s) #added r: https://stackoverflow.com/questions/50504500/deprecationwarning-invalid-escape-sequence-what-to-use-instead-of-d
json_s = json.loads(s)
return json_s

Expand Down