# install: pip install PyMuPDF
import fitz
import re
import os
import pandas as pd
# Collect the names of all PDF files found in the input folder (relative path).
file_dir = "papers/"
fnames = []
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# fnames (and of anything built from it) reproducible between runs.
for filename in sorted(os.listdir(file_dir)):
    # only pdf files
    if filename.endswith(".pdf"):
        print(filename)
        fnames.append(filename)
# Matches every run of characters that are not ASCII letters.
_NON_LETTERS = re.compile(r'[^A-Za-z]+')


def _page_text(page):
    """Return the plain text of one PyMuPDF page."""
    # Newer PyMuPDF releases expose get_text(); older ones only have the
    # camelCase getText() that this script was originally written against.
    extract = getattr(page, "get_text", None) or page.getText
    return extract("text")


def textget(filepath):
    """Return the page count and the letters-only text of a PDF.

    The first page (index 0) is treated as a cover page: its text is skipped,
    but it is still included in the returned page count.

    Args:
        filepath: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        A ``(num_pages, text)`` tuple. ``num_pages`` is the total number of
        pages in the document. ``text`` is built from pages 2..N: in each
        page every run of non-letter characters is replaced by one space,
        the pages are joined with a single space, and the result is stripped
        of leading/trailing whitespace.
    """
    # load the pdf
    doc = fitz.open(filepath)
    try:
        # len(doc) / doc[i] give the page count and a page without relying
        # on the deprecated pageCount / loadPage aliases.
        num_pages = len(doc)
        # ignore the cover page, i.e., the first page
        all_content = [_page_text(doc[pageidx]) for pageidx in range(1, num_pages)]
    finally:
        # always release the file handle, even if text extraction fails
        doc.close()
    # only keep alphabets
    clean_content = [_NON_LETTERS.sub(' ', page_text) for page_text in all_content]
    # merge the per-page strings into one whole string
    final_text = ' '.join(clean_content)
    return num_pages, final_text.strip()
# Two parallel lists: number of pages and extracted text for each report.
num_list = []
text_list = []
# loop over every PDF found in the input folder
for fn in fnames:
    # os.path.join works whether or not file_dir ends with a separator
    fpath = os.path.join(file_dir, fn)
    num_page, text = textget(fpath)
    num_list.append(num_page)
    text_list.append(text)
# create pandas dataframe with one row per report
dict_report = {'text': text_list, 'numpage': num_list}
df_report = pd.DataFrame.from_dict(dict_report)
# NOTE(review): "reprots" looks like a typo for "reports"; the filename is
# kept unchanged because other code may read this exact path -- confirm
# before renaming.
# index=False keeps the DataFrame's row index out of the CSV.
df_report.to_csv("reprots_2018.csv", index=False)