# install: pip install PyMuPDF
import fitz
import re
import os
import pandas as pd
# Collect the names of all PDF files found directly inside the reports folder.
file_dir = "papers/"
fnames = []
# os.listdir returns entries in arbitrary order; sort them so that the order of
# the collected reports (and of anything built from it) is reproducible.
for filename in sorted(os.listdir(file_dir)):
    # Only keep PDF files.
    if filename.endswith(".pdf"):
        print(filename)
        fnames.append(filename)
def textget(filepath):
    """Extract the letters-only text of a PDF, skipping its first page.

    Parameters
    ----------
    filepath : str
        Path of the PDF file to open with PyMuPDF.

    Returns
    -------
    tuple
        ``(num_pages, text)`` where ``num_pages`` is the total page count of
        the document (first page included) and ``text`` is the cleaned text
        of every page after the first, joined with single spaces and with
        leading/trailing whitespace stripped.
    """
    # Open the PDF as a context manager so the document is closed even if
    # reading one of the pages raises.
    with fitz.open(filepath) as doc:
        # snake_case PyMuPDF names; the camelCase aliases (pageCount,
        # loadPage, getText) are deprecated.
        num_pages = doc.page_count
        # Ignore the cover page, i.e. the first page (index 0).
        all_content = [
            doc.load_page(pageidx).get_text("text")
            for pageidx in range(1, num_pages)
        ]
    # Keep only ASCII letters: each run of other characters becomes one space.
    clean_content = [re.sub(r'[^A-Za-z]+', ' ', page) for page in all_content]
    # Merge the per-page strings into one string.
    final_text = ' '.join(clean_content)
    return num_pages, final_text.strip()
# Two parallel lists: page count and extracted text for each report.
num_list = []
text_list = []
# Loop over every report collected from the folder.
for fn in fnames:
    # Build the path: parent folder + filename.
    fpath = os.path.join(file_dir, fn)
    num_page, text = textget(fpath)
    num_list.append(num_page)
    text_list.append(text)
# Assemble one row per report.
dict_report = {'text': text_list, 'numpage': num_list}
df_report = pd.DataFrame.from_dict(dict_report)
# Preview of the first rows; the value is only displayed when this statement
# is evaluated interactively, otherwise it is discarded.
df_report.head()
# NOTE(review): "reprots" looks like a typo for "reports"; the name is kept
# unchanged because other code may read this exact file.
# index=False omits the row index from the CSV.
df_report.to_csv("reprots_2018.csv", index=False)