2023-07-12 14:53:16 +00:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 1,
|
|
|
|
"id": "b878d67d-c5a8-4817-99f3-4179edfd535a",
|
|
|
|
"metadata": {
|
|
|
|
"ExecuteTime": {
|
|
|
|
"end_time": "2023-07-05T16:32:58.644004+00:00",
|
|
|
|
"start_time": "2023-07-05T16:32:58.438814+00:00"
|
|
|
|
},
|
|
|
|
"noteable": {
|
|
|
|
"cell_type": "code",
|
|
|
|
"output_collection_id": "accfea1e-ca89-444f-9133-f39b76f10c8e"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"from pdfminer.high_level import extract_text\n",
|
|
|
|
"import tabula\n",
|
|
|
|
"\n",
|
|
|
|
"class PdfParser:\n",
|
|
|
|
" def __init__(self, file_path):\n",
|
|
|
|
" self.file_path = file_path\n",
|
|
|
|
"\n",
|
|
|
|
" def extract_text(self):\n",
|
|
|
|
" text = extract_text(self.file_path)\n",
|
|
|
|
" return text\n",
|
|
|
|
"\n",
|
|
|
|
" def extract_table(self):\n",
|
|
|
|
" tables = tabula.read_pdf(self.file_path, pages='all')\n",
|
|
|
|
" return tables"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"attachments": {},
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"id": "202c2c7a-a22f-468d-a67e-e10638fd96a2",
|
|
|
|
"metadata": {
|
|
|
|
"noteable": {
|
|
|
|
"cell_type": "markdown"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"source": [
|
|
|
|
"Now, let's test the `PdfParser` class with a sample PDF file. Please replace `sample.pdf` with your actual PDF file path."
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-07-30 17:45:41 +00:00
|
|
|
"execution_count": 2,
|
2023-07-12 14:53:16 +00:00
|
|
|
"id": "76db2644",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2023-07-30 17:45:41 +00:00
|
|
|
"# pdf_file_path = r\"..\\data\\raw\\chris' statements\\savings\\sc\\sc_savings_eStatement_202306.pdf\"\n",
|
|
|
|
"pdf_file_path = r\"..\\data\\raw\\chris' statements\\credit cards\\dbs\\dbs_card_9007_eStatement_202211.pdf\""
|
2023-07-12 14:53:16 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-07-30 17:45:41 +00:00
|
|
|
"execution_count": null,
|
2023-07-12 14:53:16 +00:00
|
|
|
"id": "e91a1abd-de76-4a12-adc5-fec5fd1e4301",
|
|
|
|
"metadata": {
|
|
|
|
"noteable": {
|
|
|
|
"cell_type": "code"
|
|
|
|
}
|
|
|
|
},
|
2023-07-30 17:45:41 +00:00
|
|
|
"outputs": [],
|
2023-07-12 14:53:16 +00:00
|
|
|
"source": [
|
|
|
|
"# replace 'sample.pdf' with your actual PDF file path\n",
|
|
|
|
"pdf_parser = PdfParser(pdf_file_path)\n",
|
|
|
|
"\n",
|
|
|
|
"# extract text\n",
|
|
|
|
"text = pdf_parser.extract_text()\n",
|
|
|
|
"print(text)\n",
|
|
|
|
"\n",
|
|
|
|
"# extract tables\n",
|
|
|
|
"tables = pdf_parser.extract_table()\n",
|
|
|
|
"for table in tables:\n",
|
|
|
|
" display(table)"
|
|
|
|
]
|
2023-07-30 17:45:41 +00:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"id": "60900e46",
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"len(tables)"
|
|
|
|
]
|
2023-07-12 14:53:16 +00:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "finance",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.11.3"
|
|
|
|
},
|
|
|
|
"noteable": {
|
|
|
|
"last_transaction_id": "73ebba71-e2fd-4f44-9b68-dbc9778c8fc8"
|
|
|
|
},
|
|
|
|
"noteable-chatgpt": {
|
|
|
|
"create_notebook": {
|
|
|
|
"openai_conversation_id": "179ca6e3-0377-5e6e-8f81-719779d73690",
|
|
|
|
"openai_ephemeral_user_id": "ace69c27-3d03-5a21-855f-72ec4b037401",
|
|
|
|
"openai_subdivision1_iso_code": "TW-TPE"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 5
|
|
|
|
}
|