請不要太苛刻,我是自學的初學者 :)
請幫我弄清楚如何改進我借助 PyPDF2 和 PyMuPDF (fitz) 學會的 PDF 頁面拆分方法。拆分時經常會遇到只有其中一個四分之一有文本的情況,但程序仍然把 4 個四分之一全部寫入新檔案:一個有文本,其餘都是空白。我需要一種方法讓空白的部分不被保存。我嘗試過以某種方式進行檢查,但因為知識不足沒有成功。我也試過讀取新生成的檔案并刪除空白頁,但讀取時每一頁都有文字,即使是空白頁也一樣;而在 Acrobat Reader 中打開該檔案時,這些頁面卻是空的,我不明白為什麼會這樣。
這是我的代碼,供參考我目前的做法:https://paste.aiogram.dev/opiquhehus.py
這是我第一次在這里發帖,我不知道如何附加檔案。示例 PDF 檔案放在 Telegram 頻道中:https://t.me/+Tq7WpP1ImcjQXSZF 。
import copy
import logging
import random
from pathlib import Path
import PyPDF2
import fitz
from PyPDF2.filters import decodeStreamData, ASCII85Decode
from PyPDF2.generic import EncodedStreamObject, DecodedStreamObject
def from_a4_to_a6_not_sync(input_file, output_file):
    """Write the two upper quarters of a PDF's first page to a new PDF.

    The first page of ``input_file`` is deep-copied once per quarter and the
    MediaBox of each copy is shrunk to that quarter.  Only the upper-left and
    upper-right quarters are added to the output, which is what this function
    has always written; the lower quarters are not produced.

    Args:
        input_file: Object with an ``absolute()`` method (for example a
            ``pathlib.Path``) pointing at the source PDF.
        output_file: Path of the PDF to create; it is opened in ``'wb'`` mode
            and therefore overwritten if it exists.
    """
    input_file = str(input_file.absolute())
    pdf_reader = PyPDF2.PdfFileReader(input_file)
    first_page = pdf_reader.getPage(0)

    # Centre of the page, derived from the upper-right corner only.
    # NOTE(review): this assumes the MediaBox origin is (0, 0) -- confirm for
    # PDFs whose MediaBox starts elsewhere.
    first_coord = first_page.mediaBox.upperRight[0]
    second_coord = first_page.mediaBox.upperRight[1]
    centre = (first_coord / 2, second_coord / 2)

    # Each quarter needs its own page object, because changing the MediaBox
    # mutates the page it belongs to.
    left_up_side = copy.deepcopy(first_page)
    right_up_side = copy.deepcopy(first_page)
    left_up_side.mediaBox.lowerRight = centre   # upper-left quarter
    right_up_side.mediaBox.lowerLeft = centre   # upper-right quarter

    pdf_writer = PyPDF2.PdfFileWriter()
    pdf_writer.addPage(left_up_side)
    pdf_writer.addPage(right_up_side)

    # The ``with`` statement closes the file; no explicit close() is needed.
    with open(output_file, 'wb') as file:
        pdf_writer.write(file)
def fitz_four_piaces(input_file, output_file):
    """Cut every page of a PDF into 2 x 2 quarters and save them as pages.

    For each source page four rectangles (top-left, top-right, bottom-left,
    bottom-right) are computed, and each one becomes a new output page that
    shows the matching clip of the source page.  All four quarters are
    written, including quarters with no visible content.

    Args:
        input_file: Object with an ``absolute()`` method (for example a
            ``pathlib.Path``) pointing at the source PDF.
        output_file: Destination passed unchanged to ``Document.save``.
    """
    input_file = str(input_file.absolute())
    # Context managers make sure both documents are closed, even on errors.
    with fitz.open(input_file) as src, fitz.open() as doc:
        for spage in src:  # for each page in input
            r = spage.rect  # input page rectangle
            # CropBox displacement, needed when the CropBox does not start
            # at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2  # top left rect
            r2 = r1 + (r1.width, 0, r1.width, 0)  # top right rect
            r3 = r1 + (0, r1.height, 0, r1.height)  # bottom left rect
            r4 = fitz.Rect(r1.br, r.br)  # bottom right rect

            for rx in (r1, r2, r3, r4):
                rx += d  # add the CropBox displacement
                # New output page with the dimensions of the quarter.
                page = doc.new_page(-1, width=rx.width, height=rx.height)
                page.show_pdf_page(
                    page.rect,      # fill the whole new page
                    src,            # input document
                    spage.number,   # input page number
                    clip=rx,        # which part of the input page to use
                )

        doc.save(
            output_file,
            garbage=3,     # eliminate duplicate objects
            deflate=True,  # compress stuff where possible
        )
def fitz_four_piaces_read(input_file):
    """Print the page count and the plain text of every page of a PDF.

    Args:
        input_file: Object with an ``absolute()`` method (for example a
            ``pathlib.Path``) pointing at the PDF to inspect.
    """
    input_file = str(input_file.absolute())
    # Open through a context manager so the document handle is released.
    with fitz.open(input_file) as src:
        print(f'{src.page_count=}')
        for page in src:
            print(f'{page.get_text("text")=}')
# Working folder for the sample PDFs; created if it does not exist yet.
destination = Path("MAKETS")
destination.mkdir(parents=True, exist_ok=True)

# Source sheet to split.
# NOTE(review): the original comment names "up_lef_up_rig_low_lef_low_rig",
# presumably another sample file -- confirm.
destination_input = destination / 'up_lef.pdf'

# The output name carries two random numbers (1..100 and 1..200).
destination_output = destination / 'output_a6_{}_{}.pdf'.format(
    random.randint(1, 100),
    random.randint(1, 200),
)

# Split the sheet into quarters, then dump the text of the result.
fitz_four_piaces(destination_input, destination_output)
fitz_four_piaces_read(destination_output)
uj5u.com熱心網友回復:
找到解決方案了!將頁面分成 4 部分后,需要把每個生成的頁面渲染成圖片,然后根據圖片檔案的大小判斷該頁是否為空白。我把代碼分享出來,也許會對某些人有用 :)
import os
import fitz
def get_size(filename):
    """Return the size in bytes of the file at ``filename``."""
    return os.stat(filename).st_size
async def from_a4_to_a6(input_file, output_file, blank_threshold=1300):
    """Cut every page of a PDF into 2 x 2 quarters, skipping blank quarters.

    Each quarter is first added to the output document, then rendered to a
    PNG file.  If that PNG is smaller than ``blank_threshold`` bytes the
    quarter is treated as blank and removed from the output again.  Every
    quarter of every page is examined; a blank quarter does not stop the
    remaining quarters of the same sheet from being processed.

    Although declared ``async``, the body contains no ``await``; all work
    runs synchronously once the coroutine is awaited.

    Args:
        input_file: Object with an ``absolute()`` method (for example a
            ``pathlib.Path``) pointing at the source PDF.
        output_file: Destination passed unchanged to ``Document.save``.
        blank_threshold: PNG size in bytes below which a quarter counts as
            blank.  The default of 1300 comes from the original note that a
            blank A6 page renders to roughly 1209 bytes; check the value for
            other page sizes.

    NOTE(review): if every quarter is blank the output document has no pages
    when ``save`` is called -- presumably PyMuPDF raises in that case; verify.
    """
    input_file = str(input_file.absolute())
    # Context managers make sure both documents are closed, even on errors.
    with fitz.open(input_file) as src, fitz.open() as doc:
        for spage in src:  # for each page in input
            r = spage.rect  # input page rectangle
            # CropBox displacement, needed when the CropBox does not start
            # at (0, 0).
            d = fitz.Rect(spage.cropbox_position, spage.cropbox_position)

            r1 = r / 2  # top left rect
            r2 = r1 + (r1.width, 0, r1.width, 0)  # top right rect
            r3 = r1 + (0, r1.height, 0, r1.height)  # bottom left rect
            r4 = fitz.Rect(r1.br, r.br)  # bottom right rect

            for rx in (r1, r2, r3, r4):
                rx += d  # add the CropBox displacement
                # New output page with the dimensions of the quarter.
                page = doc.new_page(-1, width=rx.width, height=rx.height)
                page.show_pdf_page(
                    page.rect,      # fill the whole new page
                    src,            # input document
                    spage.number,   # input page number
                    clip=rx,        # which part of the input page to use
                )

                # Render the new page to a PNG and use its file size as the
                # blank-page heuristic.
                pix = page.get_pixmap()
                name_png = f"page-{page.number}.png"
                pix.save(name_png)
                try:
                    imgsize = get_size(name_png)
                finally:
                    # Always clean up the temporary image.
                    os.remove(name_png)

                if imgsize < blank_threshold:
                    # Drop the page that was just appended.  No ``break``
                    # here, so later quarters of the sheet are still checked.
                    doc.delete_page(pno=-1)

        doc.save(
            output_file,
            garbage=4,     # eliminate duplicate objects
            clean=True,
            deflate=True,  # compress stuff where possible
        )
轉載請註明出處,本文鏈接:https://www.uj5u.com/qukuanlian/462036.html
