diff --git a/README.md b/README.md index 2a340df..63e7b16 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ + # Personal Finance Database -This project aims to manage personal finance data using a Google Sheets based database. It provides tools to import data from PDF statements, perform basic personal financial analysis, and visualize the data. +This project aims to manage personal finance data using a Google Sheets-based database. It provides tools to import data from PDF statements, perform basic personal financial analysis, and visualize the data. ## Features @@ -9,6 +10,44 @@ This project aims to manage personal finance data using a Google Sheets based da - **Data Analysis:** Conduct basic personal financial analysis. - **Data Visualization:** Create simple and understandable visualizations of financial data. +## Project Structure + +``` +personal-finance-database/ +├── .git +├── data/ +│ ├── raw/ +│ │ ├── your_file.pdf +│ ├── processed/ +│ └── external/ +├── src/ +│ ├── __init__.py +│ ├── data_ingestion/ +│ │ ├── __init__.py +│ │ ├── google_sheets_api.py +│ │ └── pdf_parser.py +│ ├── data_processing/ +│ │ ├── __init__.py +│ │ └── data_cleaner.py +│ └── analysis_visualization/ +│ ├── __init__.py +│ ├── financial_analysis.py +│ └── data_visualization.py +├── tests/ +│ ├── __init__.py +│ ├── test_data_ingestion.py +│ ├── test_data_processing.py +│ └── test_analysis_visualization.py +├── notebooks/ +│ ├── notebook1.ipynb +│ ├── notebook2.ipynb +│ └── ... +├── docs/ +├── .gitignore +├── README.md +└── requirements.txt +``` + ## Installation Clone this repository to your local machine. @@ -27,15 +66,19 @@ Install the necessary packages. pip install -r requirements.txt ``` ## Usage -[Provide instructions on how to use the project. This should include code examples and explanations of the different components.] + +Details on how to use the project will be updated as the project progresses. ## Contributing + Pull requests are welcome. For major changes, please open an issue first to discuss what you would like to change. ## License + [Choose an open source license and mention it here.] ## Contact + [Your Name] - [Your Email] - [Your LinkedIn/GitHub/Twitter etc.] -Remember to replace the placeholders with your actual details. You should also include a more detailed explanation in the "Usage" section once you have more functionality built out. +Remember to replace the placeholders with your actual details. diff --git a/notebooks/pdf_data_extraction_test.ipynb b/notebooks/pdf_data_extraction_test.ipynb new file mode 100644 index 0000000..80634df --- /dev/null +++ b/notebooks/pdf_data_extraction_test.ipynb @@ -0,0 +1,1522 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "b878d67d-c5a8-4817-99f3-4179edfd535a", + "metadata": { + "ExecuteTime": { + "end_time": "2023-07-05T16:32:58.644004+00:00", + "start_time": "2023-07-05T16:32:58.438814+00:00" + }, + "noteable": { + "cell_type": "code", + "output_collection_id": "accfea1e-ca89-444f-9133-f39b76f10c8e" + } + }, + "outputs": [], + "source": [ + "from pdfminer.high_level import extract_text\n", + "import tabula\n", + "\n", + "class PdfParser:\n", + " def __init__(self, file_path):\n", + " self.file_path = file_path\n", + "\n", + " def extract_text(self):\n", + " text = extract_text(self.file_path)\n", + " return text\n", + "\n", + " def extract_table(self):\n", + " tables = tabula.read_pdf(self.file_path, pages='all')\n", + " return tables" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "202c2c7a-a22f-468d-a67e-e10638fd96a2", + "metadata": { + "noteable": { + "cell_type": "markdown" + } + }, + "source": [ + "Now, let's test the `PdfParser` class with a sample PDF file. Please replace `sample.pdf` with your actual PDF file path." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "76db2644", + "metadata": {}, + "outputs": [], + "source": [ + "pdf_file_path = r\"..\\data\\raw\\chris' statements\\savings\\sc\\sc_savings_eStatement_202306.pdf\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e91a1abd-de76-4a12-adc5-fec5fd1e4301", + "metadata": { + "noteable": { + "cell_type": "code" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Consolidated Statement\n", + "\n", + "Statement Date \n", + "\n", + ": 17 Jun 2023\n", + "\n", + "Page \n", + "\n", + ": 1 of 3\n", + "\n", + "WONG CHI YAN \n", + "UNIT B 3/F BLK 1\n", + "REGENTVILLE\n", + "8 WO MUN ST\n", + "FANLING NT\n", + "HONG KONG\n", + "\n", + "YOUR FINANCIAL STATUS \n", + "\n", + "AS AT STATEMENT DATE (IN HKD EQUIVALENT) \n", + "\n", + "1. DEPOSITS \n", + "\n", + "HKD Deposits \n", + "CNY Deposits \n", + "FX Deposits \n", + "\n", + "2. INVESTMENTS \n", + "\n", + "Securities \n", + "Investment Funds \n", + "Gold \n", + "Currency Trading \n", + "Debt Securities \n", + "Equity Linked Investment \n", + "\n", + "3. PERSONAL LOANS \n", + "\n", + "Instalment Loans \n", + "Overdrafts \n", + "Credit Cards \n", + "4. WEALTHPRO \n", + "\n", + "207 \n", + "\n", + "5. INVESTPOWER \n", + "\n", + "109,214.90\n", + "1,136.87\n", + "778.10\n", + "\n", + "0.00\n", + "0.00\n", + "0.00\n", + "0.00\n", + "0.00 \n", + "0.00 \n", + "\n", + "0.00\n", + "0.00\n", + "0.00\n", + "\n", + "111,129.87\n", + "\n", + "0.00\n", + "\n", + "0.00 \n", + "\n", + "0.00 \n", + "\n", + "0.00 \n", + "\n", + "PREMIUM PROGRAMME \n", + "\n", + "6. NET POSITION \n", + "\n", + " (1+2−3+4+5)\n", + "\n", + "111,129.87 \n", + "\n", + "7. INSURANCE \n", + "\n", + "8. MORTGAGES \n", + "\n", + "9. MPF/ORSO \n", + "\n", + "0.00 \n", + "\n", + "0.00 \n", + "\n", + "0.00 \n", + "\n", + "YOUR ACCOUNT BALANCES \n", + "\n", + "Account Type\n", + "\n", + "Account Number\n", + "\n", + "Currency Balance\n", + "\n", + "HKD Balance or equivalent\n", + "\n", + "INTEGRATED S/A (Bonus Payroll)\n", + "\n", + "368−8−118927−1\n", + "\n", + "Integrated Account − Current \n", + "\n", + "368−8−118927−0\n", + "\n", + "HKD \n", + "\n", + "CNY \n", + "\n", + "USD \n", + "\n", + "TOTAL\n", + "\n", + "HKD \n", + "\n", + "TOTAL\n", + "\n", + "107,220.22 \n", + "\n", + "1,044.92 \n", + "\n", + "100.00 \n", + "\n", + "1,994.68 \n", + "\n", + "107,220.22 \n", + "\n", + "1,136.87 \n", + "\n", + "778.10 \n", + "\n", + "109,135.19 \n", + "\n", + "1,994.68 \n", + "\n", + "1,994.68 \n", + "\n", + "With immediate effect, Standard Chartered Bank (Hong Kong) Limited will need customer consent in order to access and utilize their deposit−related information for providing investment /\n", + "wealth management services to them. If you would like to make such arrangement, please visit any of our branches to arrange.\n", + "\n", + "With effect from August 2018, the Relationship Balance of your personal account will include the MPF account balance under Manulife Global Select (MPF) Scheme where you authorise and\n", + "consent the Bank to receive your MPF account information. To obtain the request form, please visit sc.com/hk/help/download−centre/ after 6 August 2018.\n", + "\n", + "2018\n", + "\n", + "8\n", + "\n", + "8\n", + "\n", + "6\n", + "\n", + "sc.com/hk/help/download−centre/\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\fConsolidated Statement \n", + "\n", + "PREMIUM PROGRAMME\n", + "\n", + "YOUR ACCOUNT ACTIVITIES \n", + "\n", + "INTEGRATED S/A (BONUS PAYROLL) \n", + "WONG CHI YAN\n", + "\n", + "Statement Date \n", + "\n", + ":17 Jun 2023\n", + "\n", + "Page \n", + "\n", + ": 2 of 3\n", + "\n", + "WONG CHI YAN\n", + "\n", + ": 368−8−118927−1\n", + "\n", + "Date \n", + "\n", + "Description \n", + "\n", + "Deposit \n", + "\n", + "Withdrawal \n", + "\n", + "Balance\n", + "\n", + "HKD \n", + "\n", + "17 May\n", + "\n", + "BALANCE FROM PREVIOUS STATEMENT\n", + "\n", + "20 May\n", + "\n", + "25 May\n", + "\n", + "FPS QR MO S** P*** HK2305200062274383\n", + "\n", + "FPS QR HKT − TEL / EYE 26404890000118A46\n", + "\n", + "FPS QR (HKT) NETVIGATO 68112310619222A62\n", + "\n", + "SCB ATM QR WDL 0078 1223\n", + "\n", + "29 May\n", + "\n", + "WONG K** L*** 00−2305−282310196700\n", + "(Value Date As of 28 MAY)\n", + "\n", + "31 May\n", + "\n", + "CREDIT INTEREST NINT\n", + "\n", + "CLOUD LIGHT TECHNOLO\n", + "\n", + "01 Jun\n", + "\n", + "WONG PAK WING KATHY 00−2306−010004322000\n", + "\n", + "TRANSFER WITHDRAWAL NTRF\n", + "INTERNET BANKING\n", + "\n", + "TRANSFER WITHDRAWAL NTRF\n", + "INTERNET BANKING\n", + "\n", + "05 Jun\n", + "\n", + "BILL PAY−B−01 5408047948319007\n", + "INTERNET BANKING\n", + "\n", + "TRANSFER WITHDRAWAL NTRF\n", + "INTERNET BANKING\n", + "\n", + "TRANSFER WITHDRAWAL NTRF\n", + "INTERNET BANKING\n", + "\n", + "06 Jun\n", + "\n", + "YU FUNG SHING 0605PAYC0101085281768\n", + "(Value Date As of 05 JUN)\n", + "\n", + "07 Jun\n", + "\n", + "12 Jun\n", + "\n", + "13 Jun\n", + "\n", + "17 Jun\n", + "\n", + "MAN YUK FAI 0606PAYC0101086133910\n", + "\n", + "PRUDENTIAL HONG KONG\n", + "\n", + "SCB ATM QR WDL 0437 1657\n", + "\n", + "TRANSFER WITHDRAWAL NTRF\n", + "INTERNET BANKING\n", + "\n", + "SCB ATM QR WDL 0093 1954\n", + "\n", + "TRANSFER WITHDRAWAL NTRF\n", + "INTERNET BANKING\n", + "\n", + "SCB ATM QR WDL 0093 1706\n", + "\n", + "17 Jun\n", + "\n", + "CLOSING BALANCE\n", + "\n", + "CNY \n", + "\n", + "17 May\n", + "\n", + "BALANCE FROM PREVIOUS STATEMENT\n", + "\n", + "31 May\n", + "\n", + "17 Jun\n", + "\n", + "CREDIT INTEREST NINT\n", + "\n", + "CLOSING BALANCE\n", + "\n", + "USD \n", + "\n", + "17 May\n", + "\n", + "BALANCE FROM PREVIOUS STATEMENT\n", + "\n", + "17 Jun\n", + "\n", + "CLOSING BALANCE\n", + "\n", + "71.51\n", + "\n", + "46,031.00\n", + "\n", + "12,008.00\n", + "\n", + "275.00\n", + "\n", + "275.00\n", + "\n", + "0.08\n", + "\n", + "285.00\n", + "\n", + "166.00\n", + "\n", + "416.00\n", + "\n", + "900.00\n", + "\n", + "12,000.00\n", + "\n", + "12,000.00\n", + "\n", + "115,572.92\n", + "\n", + "115,287.92\n", + "\n", + "115,121.92\n", + "\n", + "114,705.92\n", + "\n", + "113,805.92\n", + "\n", + "101,805.92\n", + "\n", + "101,877.43\n", + "\n", + "147,908.43\n", + "\n", + "159,916.43\n", + "\n", + "147,916.43\n", + "\n", + "12,008.00\n", + "\n", + "135,908.43\n", + "\n", + "29.86\n", + "\n", + "489.06\n", + "\n", + "135,878.57\n", + "\n", + "135,389.51\n", + "\n", + "24,226.11\n", + "\n", + "111,163.40\n", + "\n", + "793.18\n", + "\n", + "900.00\n", + "\n", + "500.00\n", + "\n", + "900.00\n", + "\n", + "500.00\n", + "\n", + "900.00\n", + "\n", + "111,438.40\n", + "\n", + "111,713.40\n", + "\n", + "110,920.22\n", + "\n", + "110,020.22\n", + "\n", + "109,520.22\n", + "\n", + "108,620.22\n", + "\n", + "108,120.22\n", + "\n", + "107,220.22\n", + "\n", + "107,220.22\n", + "\n", + "1,044.84\n", + "\n", + "1,044.92\n", + "\n", + "1,044.92\n", + "\n", + "100.00\n", + "\n", + "100.00\n", + "\n", + "1,994.68\n", + "\n", + "1,994.68\n", + "\n", + "INTEGRATED ACCOUNT − CURRENT \n", + "WONG CHI YAN\n", + "\n", + ": 368−8−118927−0\n", + "\n", + "Date \n", + "\n", + "Description \n", + "\n", + "Deposit \n", + "\n", + "Withdrawal \n", + "\n", + "Balance\n", + "\n", + "HKD \n", + "\n", + "17 May\n", + "\n", + "BALANCE FROM PREVIOUS STATEMENT\n", + "\n", + "17 Jun\n", + "\n", + "CLOSING BALANCE\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\fConsolidated Statement \n", + "\n", + "PREMIUM PROGRAMME\n", + "\n", + "YOUR AVERAGE RELATIONSHIP BALANCE \n", + "\n", + "Below is a summary of your past three months' relationship balance:\n", + "\n", + "Statement Date \n", + "\n", + ":17 Jun 2023\n", + "\n", + "Page \n", + "\n", + ": 3 of 3\n", + "\n", + "WONG CHI YAN\n", + "\n", + "Month\n", + "\n", + "Average daily relationship balance\n", + "\n", + "March 2023 \n", + "\n", + " 03 \n", + "\n", + "April 2023 \n", + "\n", + " 04 \n", + "\n", + "May 2023 \n", + "\n", + " 05 \n", + "\n", + "HKD \n", + "\n", + "HKD \n", + "\n", + "HKD \n", + "\n", + "115,653.41\n", + "\n", + "137,308.64\n", + "\n", + "151,906.70\n", + "\n", + "Please refer to \"Your Important Statement Information\" on the back of your statement for relationship balance calculation.\n", + "\n", + "For further information on minimum relationship balance requirement, please refer to the Service Charges booklet which can be obtained at any branches or our website at\n", + "sc.com/hk.\n", + "\n", + "sc.com/hk\n", + "\n", + "\fStatement Back Page 月結單背頁\n", + "\n", + "Click here to view the information on the back page of the statement.\n", + "\n", + "請 按 此 參 閱 月 結 單 背 頁 的 資 料 。 \n", + "\n", + "\f\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Got stderr: Jul 06, 2023 12:49:47 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider loadDiskCache\n", + "WARNING: New fonts found, font cache will be re-built\n", + "Jul 06, 2023 12:49:47 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider \n", + "WARNING: Building on-disk font cache, this may take a while\n", + "Jul 06, 2023 12:49:48 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider \n", + "WARNING: Finished building on-disk font cache, found 502 fonts\n", + "Jul 06, 2023 12:49:48 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType0 \n", + "WARNING: Using fallback DFKaiShu-SB-Estd-BF for CID-keyed font MSungStd-Light\n", + "Jul 06, 2023 12:49:50 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType0 \n", + "WARNING: Using fallback DFKaiShu-SB-Estd-BF for CID-keyed font MSungStd-Light\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.pdfbox.rendering.CIDType0Glyph2D getPathForCharacterCode\n", + "WARNING: No glyph for 32 (CID 0000) in font MSungStd-Light\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType0 \n", + "WARNING: Using fallback DFKaiShu-SB-Estd-BF for CID-keyed font MSungStd-Light\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:50 AM org.apache.pdfbox.rendering.CIDType0Glyph2D getPathForCharacterCode\n", + "WARNING: No glyph for 32 (CID 0000) in font MSungStd-Light\n", + "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", + "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", + "Jul 06, 2023 12:49:51 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType0 \n", + "WARNING: Using fallback DFKaiShu-SB-Estd-BF for CID-keyed font MSungStd-Light\n", + "Jul 06, 2023 12:49:51 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 findFontOrSubstitute\n", + "WARNING: Using fallback font MingLiU for CID-keyed TrueType font MSung-Light\n", + "Jul 06, 2023 12:49:51 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 findFontOrSubstitute\n", + "WARNING: Using fallback font MingLiU for CID-keyed TrueType font MSung-Light\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Account TypeAccount NumberUnnamed: 0Currency BalanceHKD Balance or equivalent
0戶口種類戶口號碼NaN外幣結存結存(以港幣為單位)
1INTEGRATED S/A (Bonus Payroll)368−8−118927−1NaNNaNNaN
2綜合存款戶口—儲蓄(紅利出糧)NaNNaNNaNNaN
3NaNNaNHKD107,220.22107,220.22
4NaNNaNCNY1,044.921,136.87
5NaNNaNUSD100.00778.10
6NaNNaNTOTALNaN109,135.19
7Integrated Account − Current 綜合存款戶口—支票368−8−118927−0NaNNaNNaN
8NaNNaNHKD1,994.681,994.68
9NaNNaNTOTALNaN1,994.68
\n", + "
" + ], + "text/plain": [ + " Account Type Account Number Unnamed: 0 \\\n", + "0 戶口種類 戶口號碼 NaN \n", + "1 INTEGRATED S/A (Bonus Payroll) 368−8−118927−1 NaN \n", + "2 綜合存款戶口—儲蓄(紅利出糧) NaN NaN \n", + "3 NaN NaN HKD \n", + "4 NaN NaN CNY \n", + "5 NaN NaN USD \n", + "6 NaN NaN TOTAL \n", + "7 Integrated Account − Current 綜合存款戶口—支票 368−8−118927−0 NaN \n", + "8 NaN NaN HKD \n", + "9 NaN NaN TOTAL \n", + "\n", + " Currency Balance HKD Balance or equivalent \n", + "0 外幣結存 結存(以港幣為單位) \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 107,220.22 107,220.22 \n", + "4 1,044.92 1,136.87 \n", + "5 100.00 778.10 \n", + "6 NaN 109,135.19 \n", + "7 NaN NaN \n", + "8 1,994.68 1,994.68 \n", + "9 NaN 1,994.68 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Date 日期Unnamed: 0Description 進支詳列Deposit 存款Withdrawal 提款Balance 結餘
0NaNHKDNaNNaNNaNNaN
117 MayBALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘NaNNaNNaN115,572.92
220 MayFPS QR MO S** P*** HK2305200062274383NaNNaN285.00115,287.92
325 MayFPS QR HKT − TEL / EYE 26404890000118A46NaNNaN166.00115,121.92
4NaNFPS QR (HKT) NETVIGATO 68112310619222A62NaNNaN416.00114,705.92
5NaNSCB ATM QR WDL0078 1223NaN900.00113,805.92
629 MayWONG K** L*** 00−2305−282310196700NaNNaN12,000.00101,805.92
7NaN(Value Date As of 28 MAY)NaNNaNNaNNaN
831 MayCREDIT INTERESTNINT71.51NaN101,877.43
9NaNCLOUD LIGHT TECHNOLONaN46,031.00NaN147,908.43
1001 JunWONG PAK WING KATHY 00−2306−010004322000NaN12,008.00NaN159,916.43
11NaNTRANSFER WITHDRAWALNTRFNaN12,000.00147,916.43
12NaNINTERNET BANKINGNaNNaNNaNNaN
13NaNTRANSFER WITHDRAWALNTRFNaN12,008.00135,908.43
14NaNINTERNET BANKINGNaNNaNNaNNaN
1505 JunBILL PAY−B−01 5408047948319007NaNNaN29.86135,878.57
16NaNINTERNET BANKINGNaNNaNNaNNaN
17NaNTRANSFER WITHDRAWALNTRFNaN489.06135,389.51
18NaNINTERNET BANKINGNaNNaNNaNNaN
19NaNTRANSFER WITHDRAWALNTRFNaN24,226.11111,163.40
20NaNINTERNET BANKINGNaNNaNNaNNaN
2106 JunYU FUNG SHING 0605PAYC0101085281768NaN275.00NaN111,438.40
22NaN(Value Date As of 05 JUN)NaNNaNNaNNaN
23NaNMAN YUK FAI 0606PAYC0101086133910NaN275.00NaN111,713.40
24NaNPRUDENTIAL HONG KONGNaNNaN793.18110,920.22
2507 JunSCB ATM QR WDL0437 1657NaN900.00110,020.22
2612 JunTRANSFER WITHDRAWALNTRFNaN500.00109,520.22
27NaNINTERNET BANKINGNaNNaNNaNNaN
2813 JunSCB ATM QR WDL0093 1954NaN900.00108,620.22
2917 JunTRANSFER WITHDRAWALNTRFNaN500.00108,120.22
30NaNINTERNET BANKINGNaNNaNNaNNaN
31NaNSCB ATM QR WDL0093 1706NaN900.00107,220.22
3217 JunCLOSING BALANCE 截數結餘NaNNaNNaN107,220.22
33NaNCNYNaNNaNNaNNaN
3417 MayBALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘NaNNaNNaN1,044.84
3531 MayCREDIT INTERESTNINT0.08NaN1,044.92
3617 JunCLOSING BALANCE 截數結餘NaNNaNNaN1,044.92
37NaNUSDNaNNaNNaNNaN
3817 MayBALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘NaNNaNNaN100.00
3917 JunCLOSING BALANCE 截數結餘NaNNaNNaN100.00
\n", + "
" + ], + "text/plain": [ + " Date 日期 Unnamed: 0 Description 進支詳列 \\\n", + "0 NaN HKD NaN \n", + "1 17 May BALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘 NaN \n", + "2 20 May FPS QR MO S** P*** HK2305200062274383 NaN \n", + "3 25 May FPS QR HKT − TEL / EYE 26404890000118A46 NaN \n", + "4 NaN FPS QR (HKT) NETVIGATO 68112310619222A62 NaN \n", + "5 NaN SCB ATM QR WDL 0078 1223 \n", + "6 29 May WONG K** L*** 00−2305−282310196700 NaN \n", + "7 NaN (Value Date As of 28 MAY) NaN \n", + "8 31 May CREDIT INTEREST NINT \n", + "9 NaN CLOUD LIGHT TECHNOLO NaN \n", + "10 01 Jun WONG PAK WING KATHY 00−2306−010004322000 NaN \n", + "11 NaN TRANSFER WITHDRAWAL NTRF \n", + "12 NaN INTERNET BANKING NaN \n", + "13 NaN TRANSFER WITHDRAWAL NTRF \n", + "14 NaN INTERNET BANKING NaN \n", + "15 05 Jun BILL PAY−B−01 5408047948319007 NaN \n", + "16 NaN INTERNET BANKING NaN \n", + "17 NaN TRANSFER WITHDRAWAL NTRF \n", + "18 NaN INTERNET BANKING NaN \n", + "19 NaN TRANSFER WITHDRAWAL NTRF \n", + "20 NaN INTERNET BANKING NaN \n", + "21 06 Jun YU FUNG SHING 0605PAYC0101085281768 NaN \n", + "22 NaN (Value Date As of 05 JUN) NaN \n", + "23 NaN MAN YUK FAI 0606PAYC0101086133910 NaN \n", + "24 NaN PRUDENTIAL HONG KONG NaN \n", + "25 07 Jun SCB ATM QR WDL 0437 1657 \n", + "26 12 Jun TRANSFER WITHDRAWAL NTRF \n", + "27 NaN INTERNET BANKING NaN \n", + "28 13 Jun SCB ATM QR WDL 0093 1954 \n", + "29 17 Jun TRANSFER WITHDRAWAL NTRF \n", + "30 NaN INTERNET BANKING NaN \n", + "31 NaN SCB ATM QR WDL 0093 1706 \n", + "32 17 Jun CLOSING BALANCE 截數結餘 NaN \n", + "33 NaN CNY NaN \n", + "34 17 May BALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘 NaN \n", + "35 31 May CREDIT INTEREST NINT \n", + "36 17 Jun CLOSING BALANCE 截數結餘 NaN \n", + "37 NaN USD NaN \n", + "38 17 May BALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘 NaN \n", + "39 17 Jun CLOSING BALANCE 截數結餘 NaN \n", + "\n", + " Deposit 存款 Withdrawal 提款 Balance 結餘 \n", + "0 NaN NaN NaN \n", + "1 NaN NaN 115,572.92 \n", + "2 NaN 285.00 115,287.92 \n", + "3 NaN 166.00 115,121.92 \n", + "4 NaN 416.00 114,705.92 \n", + "5 NaN 900.00 113,805.92 \n", + "6 NaN 12,000.00 101,805.92 \n", + "7 NaN NaN NaN \n", + "8 71.51 NaN 101,877.43 \n", + "9 46,031.00 NaN 147,908.43 \n", + "10 12,008.00 NaN 159,916.43 \n", + "11 NaN 12,000.00 147,916.43 \n", + "12 NaN NaN NaN \n", + "13 NaN 12,008.00 135,908.43 \n", + "14 NaN NaN NaN \n", + "15 NaN 29.86 135,878.57 \n", + "16 NaN NaN NaN \n", + "17 NaN 489.06 135,389.51 \n", + "18 NaN NaN NaN \n", + "19 NaN 24,226.11 111,163.40 \n", + "20 NaN NaN NaN \n", + "21 275.00 NaN 111,438.40 \n", + "22 NaN NaN NaN \n", + "23 275.00 NaN 111,713.40 \n", + "24 NaN 793.18 110,920.22 \n", + "25 NaN 900.00 110,020.22 \n", + "26 NaN 500.00 109,520.22 \n", + "27 NaN NaN NaN \n", + "28 NaN 900.00 108,620.22 \n", + "29 NaN 500.00 108,120.22 \n", + "30 NaN NaN NaN \n", + "31 NaN 900.00 107,220.22 \n", + "32 NaN NaN 107,220.22 \n", + "33 NaN NaN NaN \n", + "34 NaN NaN 1,044.84 \n", + "35 0.08 NaN 1,044.92 \n", + "36 NaN NaN 1,044.92 \n", + "37 NaN NaN NaN \n", + "38 NaN NaN 100.00 \n", + "39 NaN NaN 100.00 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Date 日期Unnamed: 0Description 進支詳列Deposit 存款Withdrawal 提款Balance 結餘
0NaNHKDNaNNaNNaNNaN
117 MayBALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘NaNNaNNaN1,994.68
217 JunCLOSING BALANCE 截數結餘NaNNaNNaN1,994.68
\n", + "
" + ], + "text/plain": [ + " Date 日期 Unnamed: 0 Description 進支詳列 \\\n", + "0 NaN HKD NaN \n", + "1 17 May BALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘 NaN \n", + "2 17 Jun CLOSING BALANCE 截數結餘 NaN \n", + "\n", + " Deposit 存款 Withdrawal 提款 Balance 結餘 \n", + "0 NaN NaN NaN \n", + "1 NaN NaN 1,994.68 \n", + "2 NaN NaN 1,994.68 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MonthAverage daily relationship balance
0月份每日平均總結餘
1March 2023 年 03 月HKD 港元 115,653.41
2April 2023 年 04 月HKD 港元 137,308.64
3May 2023 年 05 月HKD 港元 151,906.70
\n", + "
" + ], + "text/plain": [ + " Month Average daily relationship balance\n", + "0 月份 每日平均總結餘\n", + "1 March 2023 年 03 月 HKD 港元 115,653.41\n", + "2 April 2023 年 04 月 HKD 港元 137,308.64\n", + "3 May 2023 年 05 月 HKD 港元 151,906.70" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# replace 'sample.pdf' with your actual PDF file path\n", + "pdf_parser = PdfParser(pdf_file_path)\n", + "\n", + "# extract text\n", + "text = pdf_parser.extract_text()\n", + "print(text)\n", + "\n", + "# extract tables\n", + "tables = pdf_parser.extract_table()\n", + "for table in tables:\n", + " display(table)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finance", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "noteable": { + "last_transaction_id": "73ebba71-e2fd-4f44-9b68-dbc9778c8fc8" + }, + "noteable-chatgpt": { + "create_notebook": { + "openai_conversation_id": "179ca6e3-0377-5e6e-8f81-719779d73690", + "openai_ephemeral_user_id": "ace69c27-3d03-5a21-855f-72ec4b037401", + "openai_subdivision1_iso_code": "TW-TPE" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/data_ingestion/pdf_parser.py b/src/data_ingestion/pdf_parser.py index 6295cef..9f1a7ae 100644 --- a/src/data_ingestion/pdf_parser.py +++ b/src/data_ingestion/pdf_parser.py @@ -1,14 +1,35 @@ from pdfminer.high_level import extract_text import tabula -class PdfParser: - def __init__(self, file_path): +class StatementParser: + def __init__(self, file_path: str): self.file_path = file_path def extract_text(self): - text = extract_text(self.file_path) - return text + # Code to extract text from PDF + pass - def extract_table(self): - tables = tabula.read_pdf(self.file_path, pages='all') - return tables + def extract_transactions(self): + # This method should be implemented by each subclass + raise NotImplementedError + + +class DBSCreditCardStatementParser(StatementParser): + def extract_transactions(self): + # Code specific to DBS Credit Card statements + pass + + +class BankSavingAccountStatementParser(StatementParser): + def extract_transactions(self): + # Code specific to Bank Saving Account statements + pass + + + + + +class FundAccountStatementParser(StatementParser): + def extract_transactions(self): + # Code specific to Fund Account statements + pass