Sunday, July 28, 2024

Merge PDF (Merging to new document)

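# pip install pypdf
# The snippet below uses PdfReader and PdfWriter; import them first:
#   from pypdf import PdfReader, PdfWriter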


def mergePDFs(inFile1, inFile2, outFile):
  """Concatenate two PDFs into a new document.

  Every page of inFile1, followed by every page of inFile2, is appended
  to a fresh PdfWriter, which is then written to outFile in binary mode.

  PdfReader and PdfWriter must already be in scope where this function
  is defined.
  """
  merged = PdfWriter()

  # Handle the inputs one after another so the page order is
  # "all of inFile1, then all of inFile2".
  for sourcePath in (inFile1, inFile2):
    source = PdfReader(sourcePath)
    for sourcePage in source.pages:
      merged.add_page(sourcePage)

  with open(outFile, 'wb') as destination:
    merged.write(destination)


# Example run: merge the two input PDFs (first file's pages come first)
# into a brand-new output file.
inFile1, inFile2 = 'Business_Proposal.pdf', 'Some_New_Doc.pdf'
outFile = 'Merged.pdf'

mergePDFs(inFile1, inFile2, outFile)


Display PDF text on Python terminal

# pip install pypdf
# The snippet below uses PdfReader; import it first:
#   from pypdf import PdfReader


def extractFromPage1(innFile, pageNumbers):
    """Print the extracted text of selected pages, pausing after each one.

    Args:
        innFile: PDF source handed straight to ``PdfReader``.
        pageNumbers: Iterable of integer indices into ``reader.pages``
            (0-based; a negative index is accepted as long as
            ``reader.pages`` can resolve it, i.e. it is within
            ``-len(reader.pages)``).

    For every requested index the page text is printed, or the message
    ``page N does not exist`` when the index falls outside the document.
    After each index the function waits on ``input()`` before moving on.

    PdfReader must already be in scope where this function is defined.
    """
    reader = PdfReader(innFile)
    # The page count does not change while we iterate; compute it once.
    pageCount = len(reader.pages)

    for pageNumber in pageNumbers:
        # Bound the index on BOTH sides: the previous one-sided check
        # (pageNumber < pageCount) let an out-of-range negative index
        # through, which then failed with IndexError instead of being
        # reported as a missing page.
        if -pageCount <= pageNumber < pageCount:
            print(reader.pages[pageNumber].extract_text())
        else:  # If page does not exist
            print(f"page {pageNumber} does not exist")

        # Same pause in both cases; input() returns once Enter is pressed.
        input("Press any key to continue\n")


# Example run: request index 7 first and index 2 second.
# Per the original note, page 7 does not exist in this file, so the
# "does not exist" branch is exercised before a real page is printed.
innFile = 'US_Declaration.pdf'

extractFromPage1(innFile, pageNumbers=[7, 2])

Convert PDF to Word

# pip install pdf2docx


from pdf2docx import Converter


def pdf_to_word(inputFile, outputFile):
  """Convert the PDF at inputFile into a Word document at outputFile.

  Args:
    inputFile: Path of the PDF passed to pdf2docx's ``Converter``.
    outputFile: Path of the .docx file that ``convert`` writes.

  Any exception raised by the conversion propagates to the caller; the
  converter is closed first so it is not leaked.
  """
  cv = Converter(inputFile)
  try:
    # start=0 / end=None are passed explicitly; per the pdf2docx docs
    # this selects the whole document (first page through the last).
    cv.convert(outputFile, start=0, end=None)
  finally:
    # Previously close() was skipped whenever convert() raised.
    cv.close()


# Example run: write a .docx version of the sample PDF next to it.
innputFile, outputFile = 'US_Declaration.pdf', 'US_Declaration.docx'

pdf_to_word(innputFile, outputFile)


Extract pages from PDF using Python

# This code was suggested by Gemini and runs on Colab

!pip install PyPDF2==3.0.1

from PyPDF2 import PdfReader, PdfWriter

def extract_pages(input_pdf, output_pdf, pages):
    """Copy the selected pages of input_pdf into a new file output_pdf.

    ``pages`` is an iterable of indices into the reader's page list; the
    pages are added to the output in the order given. An index the
    reader cannot resolve raises before the output file is opened.
    """
    source = PdfReader(input_pdf)
    selection = PdfWriter()

    for index in pages:
        selection.add_page(source.pages[index])

    with open(output_pdf, 'wb') as sink:
        selection.write(sink)

# Colab paths: source PDF and the file that receives the extracted pages.
input_pdf_file = '/content/sample_data/Emp Skills IX.pdf'
output_pdf_file = '/content/sample_data/Green IX.pdf'

# Indices 175 through 205 (range() stops before 206).
# Original author's note on the two bounds: "+5, +6" — presumably the
# offsets between printed page numbers and PDF indices; TODO confirm.
pages_to_extract = [*range(175, 206)]

extract_pages(input_pdf_file, output_pdf_file, pages_to_extract)

===========================================================================

# This code was suggested by ChatGPT and had problems on IDLE

# pip install PyPDF2   (the import below is from PyPDF2, not pypdf)

from PyPDF2 import PdfReader, PdfWriter


def extract_pages(inputFile, outputFile, pages):
  """Write the pages of inputFile listed in ``pages`` to outputFile.

  Each entry of ``pages`` is used as an index into ``reader.pages``;
  the chosen pages are appended to a new writer in the given order and
  the result is saved in binary mode.
  """
  source = PdfReader(inputFile)
  selected = PdfWriter()

  for wanted in pages:
    chosenPage = source.pages[wanted]
    selected.add_page(chosenPage)

  with open(outputFile, 'wb') as target:
    selected.write(target)


# Example run: copy the first two pages of the sample PDF to a new file.
innFile, outFile = 'US_Declaration.pdf', 'US_Declaration1.pdf'

pages = list(range(2))  # [0, 1]

extract_pages(innFile, outFile, pages)