PDF Extraction

Tasks

  1. Use PyMuPDF to extract the text and the number of pages from the 2018/19 MSBA project reports (PDF format).

  2. Filter out all non-alphabetic characters using a regular expression.

  3. Preprocess all reports and convert them into a well-structured CSV file.

In [1]:
# install: pip install PyMuPDF
import fitz
import re
import os
import pandas as pd
In [2]:
# collect the names of all pdf files found in a relative path
file_dir = "papers/"
fnames = []
# os.listdir returns entries in arbitrary order; sorting makes the order of
# the reports (and hence the rows of the final table) reproducible
for filename in sorted(os.listdir(file_dir)):
    # only pdf files
    if filename.endswith(".pdf"):
        print(filename)
        fnames.append(filename)
report01.pdf
report02.pdf
report03.pdf
report04.pdf
report05.pdf
report06.pdf
report07.pdf
report08.pdf
report09.pdf
report10.pdf
report11.pdf
report12.pdf
report13.pdf
report14.pdf
In [3]:
# define a function that accepts the path of a pdf and returns its page count and cleaned text
def textget(filepath):
    """Return the page count and the letters-only text of one PDF report.

    The first page (index 0) is treated as the cover page: its text is
    skipped, but it is still counted in the returned number of pages.

    Args:
        filepath: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A ``(num_pages, text)`` tuple. ``num_pages`` is the total number of
        pages in the document. ``text`` is the text of every page after the
        first, with each run of characters other than A-Z / a-z replaced by
        a single space and leading/trailing whitespace stripped.
    """
    # the context manager closes the document even if extraction fails
    with fitz.open(filepath) as doc:
        # snake_case names replace the deprecated pageCount / loadPage /
        # getText aliases, which are no longer available in current PyMuPDF
        num_pages = doc.page_count
        # ignore the cover page, i.e., the first page
        pages = [
            doc.load_page(page_idx).get_text("text")
            for page_idx in range(1, num_pages)
        ]
    # join the pages first and substitute afterwards, so page boundaries and
    # empty pages collapse to a single space instead of a run of spaces
    final_text = re.sub(r'[^A-Za-z]+', ' ', ' '.join(pages))
    return num_pages, final_text.strip()
In [4]:
# define two lists to store the number of pages and the text of each report;
# entry i of both lists belongs to fnames[i]
num_list = []
text_list = []
# loop over every report found in the folder
for fn in fnames:
    # build the path with os.path.join so it stays valid even when file_dir
    # is written without a trailing separator
    fpath = os.path.join(file_dir, fn)
    num_page, text = textget(fpath)
    num_list.append(num_page)
    text_list.append(text)
In [5]:
# assemble the table: one row per report, with its cleaned text and page count
dict_report = dict(text=text_list, numpage=num_list)
df_report = pd.DataFrame(data=dict_report)
In [6]:
# preview the first rows of the table (head() returns five rows by default)
df_report.head()
Out[6]:
text numpage
0 Table of Contents Introduction Problem Definit... 16
1 Table of Contents Introduction Background Prob... 11
2 Table of Contents INTRODUCTION DATA COLLECTION... 20
3 INTRODUCTION As a platform that connects peopl... 12
4 Table of Contents Section Overview and Researc... 15
In [7]:
# write the table to disk without the row index; index is documented as a
# bool, so pass False explicitly rather than relying on None being falsy
# NOTE(review): "reprots" looks like a typo for "reports"; the name is left
# unchanged here because other code may already read this file - confirm
df_report.to_csv("reprots_2018.csv", index=False)