{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b878d67d-c5a8-4817-99f3-4179edfd535a",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-05T16:32:58.644004+00:00",
     "start_time": "2023-07-05T16:32:58.438814+00:00"
    },
    "noteable": {
     "cell_type": "code",
     "output_collection_id": "accfea1e-ca89-444f-9133-f39b76f10c8e"
    }
   },
   "outputs": [],
   "source": [
    "from pdfminer.high_level import extract_text\n",
    "import tabula\n",
    "\n",
    "\n",
    "class PdfParser:\n",
    "    \"\"\"Extract text and tables from the PDF stored at ``file_path``.\n",
    "\n",
    "    Text comes from ``pdfminer.high_level.extract_text`` and tables from\n",
    "    ``tabula.read_pdf``; both receive ``file_path`` unchanged.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, file_path):\n",
    "        \"\"\"Store ``file_path`` for later extraction calls; nothing is read here.\"\"\"\n",
    "        self.file_path = file_path\n",
    "\n",
    "    def extract_text(self):\n",
    "        \"\"\"Return the result of pdfminer's ``extract_text`` for the stored path.\"\"\"\n",
    "        # The bare name below is the module-level import, not this method:\n",
    "        # class attributes are not visible as plain names inside a method body.\n",
    "        return extract_text(self.file_path)\n",
    "\n",
    "    def extract_table(self):\n",
    "        \"\"\"Return the result of ``tabula.read_pdf`` with ``pages='all'``.\"\"\"\n",
    "        return tabula.read_pdf(self.file_path, pages='all')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "202c2c7a-a22f-468d-a67e-e10638fd96a2",
   "metadata": {
    "noteable": {
     "cell_type": "markdown"
    }
   },
   "source": [
    "Now, let's test the `PdfParser` class with a sample PDF file. Please replace `sample.pdf` with your actual PDF file path."
] },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "76db2644",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "# Joined from components so the separator follows the host OS (a literal\n",
    "# with backslashes would only resolve on Windows).\n",
    "statements_dir = Path(\"..\") / \"data\" / \"raw\" / \"chris' statements\"\n",
    "\n",
    "# Alternative input (savings statement):\n",
    "#   statements_dir / \"savings\" / \"sc\" / \"sc_savings_eStatement_202306.pdf\"\n",
    "\n",
    "# Converted to a plain str so PdfParser hands the libraries a string path.\n",
    "pdf_file_path = str(\n",
    "    statements_dir / \"credit cards\" / \"dbs\" / \"dbs_card_9007_eStatement_202211.pdf\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e91a1abd-de76-4a12-adc5-fec5fd1e4301",
   "metadata": {
    "noteable": {
     "cell_type": "code"
    }
   },
   "outputs": [],
   "source": [
    "# Parse the PDF selected via `pdf_file_path`; edit that variable to use another file.\n",
    "pdf_parser = PdfParser(pdf_file_path)\n",
    "\n",
    "# extract text\n",
    "text = pdf_parser.extract_text()\n",
    "print(text)\n",
    "\n",
    "# extract tables\n",
    "tables = pdf_parser.extract_table()\n",
    "for table in tables:\n",
    "    display(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60900e46",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(tables)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "finance",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "noteable": {
   "last_transaction_id": "73ebba71-e2fd-4f44-9b68-dbc9778c8fc8"
  },
  "noteable-chatgpt": {
   "create_notebook": {
    "openai_conversation_id": "179ca6e3-0377-5e6e-8f81-719779d73690",
    "openai_ephemeral_user_id": "ace69c27-3d03-5a21-855f-72ec4b037401",
    "openai_subdivision1_iso_code": "TW-TPE"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}