personal-finance-database/notebooks/pdf_data_extraction_test.ipynb

126 lines
3.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b878d67d-c5a8-4817-99f3-4179edfd535a",
"metadata": {
"ExecuteTime": {
"end_time": "2023-07-05T16:32:58.644004+00:00",
"start_time": "2023-07-05T16:32:58.438814+00:00"
},
"noteable": {
"cell_type": "code",
"output_collection_id": "accfea1e-ca89-444f-9133-f39b76f10c8e"
}
},
"outputs": [],
"source": [
"from pdfminer.high_level import extract_text\n",
"import tabula\n",
"\n",
"class PdfParser:\n",
"    \"\"\"Extract plain text and tables from a single PDF file.\"\"\"\n",
"\n",
"    def __init__(self, file_path):\n",
"        \"\"\"Store the path of the PDF that the extract methods will read.\n",
"\n",
"        Args:\n",
"            file_path: Location of the PDF, passed unchanged to pdfminer and tabula.\n",
"        \"\"\"\n",
"        self.file_path = file_path\n",
"\n",
"    def extract_text(self):\n",
"        \"\"\"Return the text pdfminer extracts from the PDF at ``self.file_path``.\"\"\"\n",
"        # The bare name below is the module-level pdfminer function imported at the\n",
"        # top of this cell: method bodies do not search the class namespace, so this\n",
"        # call is not recursive even though the method shares its name.\n",
"        return extract_text(self.file_path)\n",
"\n",
"    def extract_table(self, pages='all'):\n",
"        \"\"\"Return the tables tabula-py finds in the PDF.\n",
"\n",
"        Args:\n",
"            pages: Page selection forwarded to ``tabula.read_pdf``. Defaults to\n",
"                ``'all'``, which scans every page.\n",
"\n",
"        Returns:\n",
"            Whatever ``tabula.read_pdf`` returns for the selected pages.\n",
"        \"\"\"\n",
"        return tabula.read_pdf(self.file_path, pages=pages)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "202c2c7a-a22f-468d-a67e-e10638fd96a2",
"metadata": {
"noteable": {
"cell_type": "markdown"
}
},
"source": [
"Now, let's test the `PdfParser` class on a sample PDF file. Set `pdf_file_path` in the next cell to the path of your own PDF."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "76db2644",
"metadata": {},
"outputs": [],
"source": [
"# PDF to parse, given relative to the kernel's working directory.\n",
"# Forward slashes are used so the path resolves on Windows, macOS and Linux alike.\n",
"# Uncomment the next line (and comment out the one below it) to try the other sample.\n",
"# pdf_file_path = \"../data/raw/chris' statements/savings/sc/sc_savings_eStatement_202306.pdf\"\n",
"pdf_file_path = \"../data/raw/chris' statements/credit cards/dbs/dbs_card_9007_eStatement_202211.pdf\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e91a1abd-de76-4a12-adc5-fec5fd1e4301",
"metadata": {
"noteable": {
"cell_type": "code"
}
},
"outputs": [],
"source": [
"# Build a parser for the PDF selected via `pdf_file_path` in the previous cell.\n",
"pdf_parser = PdfParser(pdf_file_path)\n",
"\n",
"# Print the extracted text so it can be inspected by eye.\n",
"text = pdf_parser.extract_text()\n",
"print(text)\n",
"\n",
"# Show every extracted table as its own rich output.\n",
"tables = pdf_parser.extract_table()\n",
"for table in tables:\n",
"    display(table)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60900e46",
"metadata": {},
"outputs": [],
"source": [
"# How many tables were extracted from the PDF?\n",
"len(tables)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "finance",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"noteable": {
"last_transaction_id": "73ebba71-e2fd-4f44-9b68-dbc9778c8fc8"
},
"noteable-chatgpt": {
"create_notebook": {
"openai_conversation_id": "179ca6e3-0377-5e6e-8f81-719779d73690",
"openai_ephemeral_user_id": "ace69c27-3d03-5a21-855f-72ec4b037401",
"openai_subdivision1_iso_code": "TW-TPE"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}