PDF Extraction

Tasks

Use PyMuPDF to extract the text and the number of pages from the 2018/19 MSBA project reports, which are in PDF format.
Filter out all non-alphabetic characters using a regular expression.
Preprocess all reports and convert them into a well-structured CSV file.

# install: pip install PyMuPDF
import fitz
import re
import os
import pandas as pd

# collect the name of every PDF found in the reports folder (relative path)
file_dir = "papers/"
fnames = [entry for entry in os.listdir(file_dir) if entry.endswith(".pdf")]
# echo the selected files, one name per line
for entry in fnames:
    print(entry)

report01.pdf
report02.pdf
report03.pdf
report04.pdf
report05.pdf
report06.pdf
report07.pdf
report08.pdf
report09.pdf
report10.pdf
report11.pdf
report12.pdf
report13.pdf
report14.pdf

# define a function that accepts the path of a pdf and returns its page count and content
def textget(filepath):
    """Extract the page count and the letters-only body text of a PDF.

    The first page (index 0) is treated as a cover page: it is left out of
    the extracted text but is still included in the returned page count.

    Args:
        filepath: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A ``(num_pages, text)`` tuple. ``num_pages`` is the total number of
        pages in the document. ``text`` is built from every page except the
        first: within each page, every run of characters other than
        ``A-Z``/``a-z`` is replaced by a single space; the pages are then
        joined with one space and the result is stripped of surrounding
        whitespace.
    """
    # Open the document as a context manager so it is closed even when a
    # page fails to load; previously the document was never closed.
    with fitz.open(filepath) as doc:
        # page_count / load_page / get_text are the snake_case names that
        # replace the deprecated camelCase aliases (pageCount / loadPage /
        # getText), which newer PyMuPDF releases no longer provide by default.
        num_pages = doc.page_count
        # ignore the cover page, i.e., the first page
        all_content = [
            doc.load_page(pageidx).get_text("text")
            for pageidx in range(1, num_pages)
        ]
    # only keep alphabetic characters (cleaned page by page, before joining)
    clean_content = [re.sub(r'[^A-Za-z]+', ' ', page) for page in all_content]
    # merge the per-page strings into one whole string
    final_text = ' '.join(clean_content)
    return num_pages, final_text.strip()

# run the extraction once per report; each path is parent folder + filename
extracted = [textget(file_dir + report_name) for report_name in fnames]
# split the (page count, text) pairs into two parallel lists
num_list = [page_total for page_total, _ in extracted]
text_list = [body for _, body in extracted]

# build a two-column table: extracted text and page count, one row per report
df_report = pd.DataFrame({'text': text_list, 'numpage': num_list})

# preview the first rows (rendered as cell output when run in a notebook)
df_report.head()

# Write the table without the row index. `index=False` is the documented
# boolean spelling; the earlier `index=None` only worked because None is falsy.
# NOTE(review): the output name is spelled "reprots" -- kept unchanged in case
# other code reads this exact file; confirm before renaming.
df_report.to_csv("reprots_2018.csv", index=False)

	text	numpage
0	Table of Contents Introduction Problem Definit...	16
1	Table of Contents Introduction Background Prob...	11
2	Table of Contents INTRODUCTION DATA COLLECTION...	20
3	INTRODUCTION As a platform that connects peopl...	12
4	Table of Contents Section Overview and Researc...	15