diff --git a/notebooks/pdf_data_extraction_test.ipynb b/notebooks/pdf_data_extraction_test.ipynb index 80634df..65eb268 100644 --- a/notebooks/pdf_data_extraction_test.ipynb +++ b/notebooks/pdf_data_extraction_test.ipynb @@ -47,1432 +47,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "76db2644", "metadata": {}, "outputs": [], "source": [ - "pdf_file_path = r\"..\\data\\raw\\chris' statements\\savings\\sc\\sc_savings_eStatement_202306.pdf\"" + "# pdf_file_path = r\"..\\data\\raw\\chris' statements\\savings\\sc\\sc_savings_eStatement_202306.pdf\"\n", + "pdf_file_path = r\"..\\data\\raw\\chris' statements\\credit cards\\dbs\\dbs_card_9007_eStatement_202211.pdf\"" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "e91a1abd-de76-4a12-adc5-fec5fd1e4301", "metadata": { "noteable": { "cell_type": "code" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Consolidated Statement\n", - "\n", - "Statement Date \n", - "\n", - ": 17 Jun 2023\n", - "\n", - "Page \n", - "\n", - ": 1 of 3\n", - "\n", - "WONG CHI YAN \n", - "UNIT B 3/F BLK 1\n", - "REGENTVILLE\n", - "8 WO MUN ST\n", - "FANLING NT\n", - "HONG KONG\n", - "\n", - "YOUR FINANCIAL STATUS \n", - "\n", - "AS AT STATEMENT DATE (IN HKD EQUIVALENT) \n", - "\n", - "1. DEPOSITS \n", - "\n", - "HKD Deposits \n", - "CNY Deposits \n", - "FX Deposits \n", - "\n", - "2. INVESTMENTS \n", - "\n", - "Securities \n", - "Investment Funds \n", - "Gold \n", - "Currency Trading \n", - "Debt Securities \n", - "Equity Linked Investment \n", - "\n", - "3. PERSONAL LOANS \n", - "\n", - "Instalment Loans \n", - "Overdrafts \n", - "Credit Cards \n", - "4. WEALTHPRO \n", - "\n", - "207 \n", - "\n", - "5. INVESTPOWER \n", - "\n", - "109,214.90\n", - "1,136.87\n", - "778.10\n", - "\n", - "0.00\n", - "0.00\n", - "0.00\n", - "0.00\n", - "0.00 \n", - "0.00 \n", - "\n", - "0.00\n", - "0.00\n", - "0.00\n", - "\n", - "111,129.87\n", - "\n", - "0.00\n", - "\n", - "0.00 \n", - "\n", - "0.00 \n", - "\n", - "0.00 \n", - "\n", - "PREMIUM PROGRAMME \n", - "\n", - "6. NET POSITION \n", - "\n", - " (1+2−3+4+5)\n", - "\n", - "111,129.87 \n", - "\n", - "7. INSURANCE \n", - "\n", - "8. MORTGAGES \n", - "\n", - "9. MPF/ORSO \n", - "\n", - "0.00 \n", - "\n", - "0.00 \n", - "\n", - "0.00 \n", - "\n", - "YOUR ACCOUNT BALANCES \n", - "\n", - "Account Type\n", - "\n", - "Account Number\n", - "\n", - "Currency Balance\n", - "\n", - "HKD Balance or equivalent\n", - "\n", - "INTEGRATED S/A (Bonus Payroll)\n", - "\n", - "368−8−118927−1\n", - "\n", - "Integrated Account − Current \n", - "\n", - "368−8−118927−0\n", - "\n", - "HKD \n", - "\n", - "CNY \n", - "\n", - "USD \n", - "\n", - "TOTAL\n", - "\n", - "HKD \n", - "\n", - "TOTAL\n", - "\n", - "107,220.22 \n", - "\n", - "1,044.92 \n", - "\n", - "100.00 \n", - "\n", - "1,994.68 \n", - "\n", - "107,220.22 \n", - "\n", - "1,136.87 \n", - "\n", - "778.10 \n", - "\n", - "109,135.19 \n", - "\n", - "1,994.68 \n", - "\n", - "1,994.68 \n", - "\n", - "With immediate effect, Standard Chartered Bank (Hong Kong) Limited will need customer consent in order to access and utilize their deposit−related information for providing investment /\n", - "wealth management services to them. If you would like to make such arrangement, please visit any of our branches to arrange.\n", - "\n", - "With effect from August 2018, the Relationship Balance of your personal account will include the MPF account balance under Manulife Global Select (MPF) Scheme where you authorise and\n", - "consent the Bank to receive your MPF account information. To obtain the request form, please visit sc.com/hk/help/download−centre/ after 6 August 2018.\n", - "\n", - "2018\n", - "\n", - "8\n", - "\n", - "8\n", - "\n", - "6\n", - "\n", - "sc.com/hk/help/download−centre/\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\fConsolidated Statement \n", - "\n", - "PREMIUM PROGRAMME\n", - "\n", - "YOUR ACCOUNT ACTIVITIES \n", - "\n", - "INTEGRATED S/A (BONUS PAYROLL) \n", - "WONG CHI YAN\n", - "\n", - "Statement Date \n", - "\n", - ":17 Jun 2023\n", - "\n", - "Page \n", - "\n", - ": 2 of 3\n", - "\n", - "WONG CHI YAN\n", - "\n", - ": 368−8−118927−1\n", - "\n", - "Date \n", - "\n", - "Description \n", - "\n", - "Deposit \n", - "\n", - "Withdrawal \n", - "\n", - "Balance\n", - "\n", - "HKD \n", - "\n", - "17 May\n", - "\n", - "BALANCE FROM PREVIOUS STATEMENT\n", - "\n", - "20 May\n", - "\n", - "25 May\n", - "\n", - "FPS QR MO S** P*** HK2305200062274383\n", - "\n", - "FPS QR HKT − TEL / EYE 26404890000118A46\n", - "\n", - "FPS QR (HKT) NETVIGATO 68112310619222A62\n", - "\n", - "SCB ATM QR WDL 0078 1223\n", - "\n", - "29 May\n", - "\n", - "WONG K** L*** 00−2305−282310196700\n", - "(Value Date As of 28 MAY)\n", - "\n", - "31 May\n", - "\n", - "CREDIT INTEREST NINT\n", - "\n", - "CLOUD LIGHT TECHNOLO\n", - "\n", - "01 Jun\n", - "\n", - "WONG PAK WING KATHY 00−2306−010004322000\n", - "\n", - "TRANSFER WITHDRAWAL NTRF\n", - "INTERNET BANKING\n", - "\n", - "TRANSFER WITHDRAWAL NTRF\n", - "INTERNET BANKING\n", - "\n", - "05 Jun\n", - "\n", - "BILL PAY−B−01 5408047948319007\n", - "INTERNET BANKING\n", - "\n", - "TRANSFER WITHDRAWAL NTRF\n", - "INTERNET BANKING\n", - "\n", - "TRANSFER WITHDRAWAL NTRF\n", - "INTERNET BANKING\n", - "\n", - "06 Jun\n", - "\n", - "YU FUNG SHING 0605PAYC0101085281768\n", - "(Value Date As of 05 JUN)\n", - "\n", - "07 Jun\n", - "\n", - "12 Jun\n", - "\n", - "13 Jun\n", - "\n", - "17 Jun\n", - "\n", - "MAN YUK FAI 0606PAYC0101086133910\n", - "\n", - "PRUDENTIAL HONG KONG\n", - "\n", - "SCB ATM QR WDL 0437 1657\n", - "\n", - "TRANSFER WITHDRAWAL NTRF\n", - "INTERNET BANKING\n", - "\n", - "SCB ATM QR WDL 0093 1954\n", - "\n", - "TRANSFER WITHDRAWAL NTRF\n", - "INTERNET BANKING\n", - "\n", - "SCB ATM QR WDL 0093 1706\n", - "\n", - "17 Jun\n", - "\n", - "CLOSING BALANCE\n", - "\n", - "CNY \n", - "\n", - "17 May\n", - "\n", - "BALANCE FROM PREVIOUS STATEMENT\n", - "\n", - "31 May\n", - "\n", - "17 Jun\n", - "\n", - "CREDIT INTEREST NINT\n", - "\n", - "CLOSING BALANCE\n", - "\n", - "USD \n", - "\n", - "17 May\n", - "\n", - "BALANCE FROM PREVIOUS STATEMENT\n", - "\n", - "17 Jun\n", - "\n", - "CLOSING BALANCE\n", - "\n", - "71.51\n", - "\n", - "46,031.00\n", - "\n", - "12,008.00\n", - "\n", - "275.00\n", - "\n", - "275.00\n", - "\n", - "0.08\n", - "\n", - "285.00\n", - "\n", - "166.00\n", - "\n", - "416.00\n", - "\n", - "900.00\n", - "\n", - "12,000.00\n", - "\n", - "12,000.00\n", - "\n", - "115,572.92\n", - "\n", - "115,287.92\n", - "\n", - "115,121.92\n", - "\n", - "114,705.92\n", - "\n", - "113,805.92\n", - "\n", - "101,805.92\n", - "\n", - "101,877.43\n", - "\n", - "147,908.43\n", - "\n", - "159,916.43\n", - "\n", - "147,916.43\n", - "\n", - "12,008.00\n", - "\n", - "135,908.43\n", - "\n", - "29.86\n", - "\n", - "489.06\n", - "\n", - "135,878.57\n", - "\n", - "135,389.51\n", - "\n", - "24,226.11\n", - "\n", - "111,163.40\n", - "\n", - "793.18\n", - "\n", - "900.00\n", - "\n", - "500.00\n", - "\n", - "900.00\n", - "\n", - "500.00\n", - "\n", - "900.00\n", - "\n", - "111,438.40\n", - "\n", - "111,713.40\n", - "\n", - "110,920.22\n", - "\n", - "110,020.22\n", - "\n", - "109,520.22\n", - "\n", - "108,620.22\n", - "\n", - "108,120.22\n", - "\n", - "107,220.22\n", - "\n", - "107,220.22\n", - "\n", - "1,044.84\n", - "\n", - "1,044.92\n", - "\n", - "1,044.92\n", - "\n", - "100.00\n", - "\n", - "100.00\n", - "\n", - "1,994.68\n", - "\n", - "1,994.68\n", - "\n", - "INTEGRATED ACCOUNT − CURRENT \n", - "WONG CHI YAN\n", - "\n", - ": 368−8−118927−0\n", - "\n", - "Date \n", - "\n", - "Description \n", - "\n", - "Deposit \n", - "\n", - "Withdrawal \n", - "\n", - "Balance\n", - "\n", - "HKD \n", - "\n", - "17 May\n", - "\n", - "BALANCE FROM PREVIOUS STATEMENT\n", - "\n", - "17 Jun\n", - "\n", - "CLOSING BALANCE\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\fConsolidated Statement \n", - "\n", - "PREMIUM PROGRAMME\n", - "\n", - "YOUR AVERAGE RELATIONSHIP BALANCE \n", - "\n", - "Below is a summary of your past three months' relationship balance:\n", - "\n", - "Statement Date \n", - "\n", - ":17 Jun 2023\n", - "\n", - "Page \n", - "\n", - ": 3 of 3\n", - "\n", - "WONG CHI YAN\n", - "\n", - "Month\n", - "\n", - "Average daily relationship balance\n", - "\n", - "March 2023 \n", - "\n", - " 03 \n", - "\n", - "April 2023 \n", - "\n", - " 04 \n", - "\n", - "May 2023 \n", - "\n", - " 05 \n", - "\n", - "HKD \n", - "\n", - "HKD \n", - "\n", - "HKD \n", - "\n", - "115,653.41\n", - "\n", - "137,308.64\n", - "\n", - "151,906.70\n", - "\n", - "Please refer to \"Your Important Statement Information\" on the back of your statement for relationship balance calculation.\n", - "\n", - "For further information on minimum relationship balance requirement, please refer to the Service Charges booklet which can be obtained at any branches or our website at\n", - "sc.com/hk.\n", - "\n", - "sc.com/hk\n", - "\n", - "\fStatement Back Page 月結單背頁\n", - "\n", - "Click here to view the information on the back page of the statement.\n", - "\n", - "請 按 此 參 閱 月 結 單 背 頁 的 資 料 。 \n", - "\n", - "\f\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Got stderr: Jul 06, 2023 12:49:47 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider loadDiskCache\n", - "WARNING: New fonts found, font cache will be re-built\n", - "Jul 06, 2023 12:49:47 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider \n", - "WARNING: Building on-disk font cache, this may take a while\n", - "Jul 06, 2023 12:49:48 AM org.apache.pdfbox.pdmodel.font.FileSystemFontProvider \n", - "WARNING: Finished building on-disk font cache, found 502 fonts\n", - "Jul 06, 2023 12:49:48 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType0 \n", - "WARNING: Using fallback DFKaiShu-SB-Estd-BF for CID-keyed font MSungStd-Light\n", - "Jul 06, 2023 12:49:50 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType0 \n", - "WARNING: Using fallback DFKaiShu-SB-Estd-BF for CID-keyed font MSungStd-Light\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.pdfbox.rendering.CIDType0Glyph2D getPathForCharacterCode\n", - "WARNING: No glyph for 32 (CID 0000) in font MSungStd-Light\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType0 \n", - "WARNING: Using fallback DFKaiShu-SB-Estd-BF for CID-keyed font MSungStd-Light\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:50 AM org.apache.pdfbox.rendering.CIDType0Glyph2D getPathForCharacterCode\n", - "WARNING: No glyph for 32 (CID 0000) in font MSungStd-Light\n", - "Jul 06, 2023 12:49:50 AM org.apache.fontbox.cmap.CMap readCode\n", - "WARNING: Invalid character code sequence 0x00 (0000) 0x20 (0040) in CMap ETen-HOST-H\n", - "Jul 06, 2023 12:49:51 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType0 \n", - "WARNING: Using fallback DFKaiShu-SB-Estd-BF for CID-keyed font MSungStd-Light\n", - "Jul 06, 2023 12:49:51 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 findFontOrSubstitute\n", - "WARNING: Using fallback font MingLiU for CID-keyed TrueType font MSung-Light\n", - "Jul 06, 2023 12:49:51 AM org.apache.pdfbox.pdmodel.font.PDCIDFontType2 findFontOrSubstitute\n", - "WARNING: Using fallback font MingLiU for CID-keyed TrueType font MSung-Light\n", - "\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Account TypeAccount NumberUnnamed: 0Currency BalanceHKD Balance or equivalent
0戶口種類戶口號碼NaN外幣結存結存(以港幣為單位)
1INTEGRATED S/A (Bonus Payroll)368−8−118927−1NaNNaNNaN
2綜合存款戶口—儲蓄(紅利出糧)NaNNaNNaNNaN
3NaNNaNHKD107,220.22107,220.22
4NaNNaNCNY1,044.921,136.87
5NaNNaNUSD100.00778.10
6NaNNaNTOTALNaN109,135.19
7Integrated Account − Current 綜合存款戶口—支票368−8−118927−0NaNNaNNaN
8NaNNaNHKD1,994.681,994.68
9NaNNaNTOTALNaN1,994.68
\n", - "
" - ], - "text/plain": [ - " Account Type Account Number Unnamed: 0 \\\n", - "0 戶口種類 戶口號碼 NaN \n", - "1 INTEGRATED S/A (Bonus Payroll) 368−8−118927−1 NaN \n", - "2 綜合存款戶口—儲蓄(紅利出糧) NaN NaN \n", - "3 NaN NaN HKD \n", - "4 NaN NaN CNY \n", - "5 NaN NaN USD \n", - "6 NaN NaN TOTAL \n", - "7 Integrated Account − Current 綜合存款戶口—支票 368−8−118927−0 NaN \n", - "8 NaN NaN HKD \n", - "9 NaN NaN TOTAL \n", - "\n", - " Currency Balance HKD Balance or equivalent \n", - "0 外幣結存 結存(以港幣為單位) \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 107,220.22 107,220.22 \n", - "4 1,044.92 1,136.87 \n", - "5 100.00 778.10 \n", - "6 NaN 109,135.19 \n", - "7 NaN NaN \n", - "8 1,994.68 1,994.68 \n", - "9 NaN 1,994.68 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Date 日期Unnamed: 0Description 進支詳列Deposit 存款Withdrawal 提款Balance 結餘
0NaNHKDNaNNaNNaNNaN
117 MayBALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘NaNNaNNaN115,572.92
220 MayFPS QR MO S** P*** HK2305200062274383NaNNaN285.00115,287.92
325 MayFPS QR HKT − TEL / EYE 26404890000118A46NaNNaN166.00115,121.92
4NaNFPS QR (HKT) NETVIGATO 68112310619222A62NaNNaN416.00114,705.92
5NaNSCB ATM QR WDL0078 1223NaN900.00113,805.92
629 MayWONG K** L*** 00−2305−282310196700NaNNaN12,000.00101,805.92
7NaN(Value Date As of 28 MAY)NaNNaNNaNNaN
831 MayCREDIT INTERESTNINT71.51NaN101,877.43
9NaNCLOUD LIGHT TECHNOLONaN46,031.00NaN147,908.43
1001 JunWONG PAK WING KATHY 00−2306−010004322000NaN12,008.00NaN159,916.43
11NaNTRANSFER WITHDRAWALNTRFNaN12,000.00147,916.43
12NaNINTERNET BANKINGNaNNaNNaNNaN
13NaNTRANSFER WITHDRAWALNTRFNaN12,008.00135,908.43
14NaNINTERNET BANKINGNaNNaNNaNNaN
1505 JunBILL PAY−B−01 5408047948319007NaNNaN29.86135,878.57
16NaNINTERNET BANKINGNaNNaNNaNNaN
17NaNTRANSFER WITHDRAWALNTRFNaN489.06135,389.51
18NaNINTERNET BANKINGNaNNaNNaNNaN
19NaNTRANSFER WITHDRAWALNTRFNaN24,226.11111,163.40
20NaNINTERNET BANKINGNaNNaNNaNNaN
2106 JunYU FUNG SHING 0605PAYC0101085281768NaN275.00NaN111,438.40
22NaN(Value Date As of 05 JUN)NaNNaNNaNNaN
23NaNMAN YUK FAI 0606PAYC0101086133910NaN275.00NaN111,713.40
24NaNPRUDENTIAL HONG KONGNaNNaN793.18110,920.22
2507 JunSCB ATM QR WDL0437 1657NaN900.00110,020.22
2612 JunTRANSFER WITHDRAWALNTRFNaN500.00109,520.22
27NaNINTERNET BANKINGNaNNaNNaNNaN
2813 JunSCB ATM QR WDL0093 1954NaN900.00108,620.22
2917 JunTRANSFER WITHDRAWALNTRFNaN500.00108,120.22
30NaNINTERNET BANKINGNaNNaNNaNNaN
31NaNSCB ATM QR WDL0093 1706NaN900.00107,220.22
3217 JunCLOSING BALANCE 截數結餘NaNNaNNaN107,220.22
33NaNCNYNaNNaNNaNNaN
3417 MayBALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘NaNNaNNaN1,044.84
3531 MayCREDIT INTERESTNINT0.08NaN1,044.92
3617 JunCLOSING BALANCE 截數結餘NaNNaNNaN1,044.92
37NaNUSDNaNNaNNaNNaN
3817 MayBALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘NaNNaNNaN100.00
3917 JunCLOSING BALANCE 截數結餘NaNNaNNaN100.00
\n", - "
" - ], - "text/plain": [ - " Date 日期 Unnamed: 0 Description 進支詳列 \\\n", - "0 NaN HKD NaN \n", - "1 17 May BALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘 NaN \n", - "2 20 May FPS QR MO S** P*** HK2305200062274383 NaN \n", - "3 25 May FPS QR HKT − TEL / EYE 26404890000118A46 NaN \n", - "4 NaN FPS QR (HKT) NETVIGATO 68112310619222A62 NaN \n", - "5 NaN SCB ATM QR WDL 0078 1223 \n", - "6 29 May WONG K** L*** 00−2305−282310196700 NaN \n", - "7 NaN (Value Date As of 28 MAY) NaN \n", - "8 31 May CREDIT INTEREST NINT \n", - "9 NaN CLOUD LIGHT TECHNOLO NaN \n", - "10 01 Jun WONG PAK WING KATHY 00−2306−010004322000 NaN \n", - "11 NaN TRANSFER WITHDRAWAL NTRF \n", - "12 NaN INTERNET BANKING NaN \n", - "13 NaN TRANSFER WITHDRAWAL NTRF \n", - "14 NaN INTERNET BANKING NaN \n", - "15 05 Jun BILL PAY−B−01 5408047948319007 NaN \n", - "16 NaN INTERNET BANKING NaN \n", - "17 NaN TRANSFER WITHDRAWAL NTRF \n", - "18 NaN INTERNET BANKING NaN \n", - "19 NaN TRANSFER WITHDRAWAL NTRF \n", - "20 NaN INTERNET BANKING NaN \n", - "21 06 Jun YU FUNG SHING 0605PAYC0101085281768 NaN \n", - "22 NaN (Value Date As of 05 JUN) NaN \n", - "23 NaN MAN YUK FAI 0606PAYC0101086133910 NaN \n", - "24 NaN PRUDENTIAL HONG KONG NaN \n", - "25 07 Jun SCB ATM QR WDL 0437 1657 \n", - "26 12 Jun TRANSFER WITHDRAWAL NTRF \n", - "27 NaN INTERNET BANKING NaN \n", - "28 13 Jun SCB ATM QR WDL 0093 1954 \n", - "29 17 Jun TRANSFER WITHDRAWAL NTRF \n", - "30 NaN INTERNET BANKING NaN \n", - "31 NaN SCB ATM QR WDL 0093 1706 \n", - "32 17 Jun CLOSING BALANCE 截數結餘 NaN \n", - "33 NaN CNY NaN \n", - "34 17 May BALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘 NaN \n", - "35 31 May CREDIT INTEREST NINT \n", - "36 17 Jun CLOSING BALANCE 截數結餘 NaN \n", - "37 NaN USD NaN \n", - "38 17 May BALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘 NaN \n", - "39 17 Jun CLOSING BALANCE 截數結餘 NaN \n", - "\n", - " Deposit 存款 Withdrawal 提款 Balance 結餘 \n", - "0 NaN NaN NaN \n", - "1 NaN NaN 115,572.92 \n", - "2 NaN 285.00 115,287.92 \n", - "3 NaN 166.00 115,121.92 \n", - "4 NaN 416.00 114,705.92 \n", - "5 NaN 900.00 113,805.92 \n", - "6 NaN 12,000.00 101,805.92 \n", - "7 NaN NaN NaN \n", - "8 71.51 NaN 101,877.43 \n", - "9 46,031.00 NaN 147,908.43 \n", - "10 12,008.00 NaN 159,916.43 \n", - "11 NaN 12,000.00 147,916.43 \n", - "12 NaN NaN NaN \n", - "13 NaN 12,008.00 135,908.43 \n", - "14 NaN NaN NaN \n", - "15 NaN 29.86 135,878.57 \n", - "16 NaN NaN NaN \n", - "17 NaN 489.06 135,389.51 \n", - "18 NaN NaN NaN \n", - "19 NaN 24,226.11 111,163.40 \n", - "20 NaN NaN NaN \n", - "21 275.00 NaN 111,438.40 \n", - "22 NaN NaN NaN \n", - "23 275.00 NaN 111,713.40 \n", - "24 NaN 793.18 110,920.22 \n", - "25 NaN 900.00 110,020.22 \n", - "26 NaN 500.00 109,520.22 \n", - "27 NaN NaN NaN \n", - "28 NaN 900.00 108,620.22 \n", - "29 NaN 500.00 108,120.22 \n", - "30 NaN NaN NaN \n", - "31 NaN 900.00 107,220.22 \n", - "32 NaN NaN 107,220.22 \n", - "33 NaN NaN NaN \n", - "34 NaN NaN 1,044.84 \n", - "35 0.08 NaN 1,044.92 \n", - "36 NaN NaN 1,044.92 \n", - "37 NaN NaN NaN \n", - "38 NaN NaN 100.00 \n", - "39 NaN NaN 100.00 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Date 日期Unnamed: 0Description 進支詳列Deposit 存款Withdrawal 提款Balance 結餘
0NaNHKDNaNNaNNaNNaN
117 MayBALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘NaNNaNNaN1,994.68
217 JunCLOSING BALANCE 截數結餘NaNNaNNaN1,994.68
\n", - "
" - ], - "text/plain": [ - " Date 日期 Unnamed: 0 Description 進支詳列 \\\n", - "0 NaN HKD NaN \n", - "1 17 May BALANCE FROM PREVIOUS STATEMENT 戶口之進支結餘 NaN \n", - "2 17 Jun CLOSING BALANCE 截數結餘 NaN \n", - "\n", - " Deposit 存款 Withdrawal 提款 Balance 結餘 \n", - "0 NaN NaN NaN \n", - "1 NaN NaN 1,994.68 \n", - "2 NaN NaN 1,994.68 " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MonthAverage daily relationship balance
0月份每日平均總結餘
1March 2023 年 03 月HKD 港元 115,653.41
2April 2023 年 04 月HKD 港元 137,308.64
3May 2023 年 05 月HKD 港元 151,906.70
\n", - "
" - ], - "text/plain": [ - " Month Average daily relationship balance\n", - "0 月份 每日平均總結餘\n", - "1 March 2023 年 03 月 HKD 港元 115,653.41\n", - "2 April 2023 年 04 月 HKD 港元 137,308.64\n", - "3 May 2023 年 05 月 HKD 港元 151,906.70" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# replace 'sample.pdf' with your actual PDF file path\n", "pdf_parser = PdfParser(pdf_file_path)\n", @@ -1486,6 +79,16 @@ "for table in tables:\n", " display(table)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60900e46", + "metadata": {}, + "outputs": [], + "source": [ + "len(tables)" + ] } ], "metadata": { diff --git a/src/data_ingestion/pdf_parser.py b/src/data_ingestion/pdf_parser.py index 9f1a7ae..145cdc2 100644 --- a/src/data_ingestion/pdf_parser.py +++ b/src/data_ingestion/pdf_parser.py @@ -1,13 +1,24 @@ -from pdfminer.high_level import extract_text +import re +from datetime import datetime +import pdfplumber +import pandas as pd + +# from pdfminer.high_level import extract_text import tabula + class StatementParser: - def __init__(self, file_path: str): + def __init__(self, file_path): self.file_path = file_path def extract_text(self): - # Code to extract text from PDF - pass + with pdfplumber.open(self.file_path) as pdf: + full_text = "\n".join(page.extract_text() for page in pdf.pages) + return full_text + + def extract_table(self): + tables = tabula.read_pdf(self.file_path, pages="all") + return tables def extract_transactions(self): # This method should be implemented by each subclass @@ -15,9 +26,72 @@ class StatementParser: class DBSCreditCardStatementParser(StatementParser): + def extract_card_last_4_digits(self): + full_text = self.extract_text() + card_number_pattern = r"(\d{4}-\d{4}-\d{4}-\d{4})" + card_number_match = re.search(card_number_pattern, full_text) + if card_number_match: + card_number = card_number_match.group() + last_4_digits = card_number.split("-")[-1] + else: + raise ValueError("Card number not found in the statement.") + return last_4_digits + + def extract_previous_balance(self): + full_text = self.extract_text() + prev_balance_pattern = r"PREVIOUS BALANCE ([0-9,]+\.\d{2})( CR)?" + prev_balance_match = re.search(prev_balance_pattern, full_text) + if prev_balance_match: + prev_balance = self.clean_amount(prev_balance_match.group(1)) + else: + raise ValueError("Previous balance not found in the statement.") + return prev_balance + def extract_transactions(self): - # Code specific to DBS Credit Card statements - pass + full_text = self.extract_text() + pattern = r"(\d{2} \w{3})\s(\d{2} \w{3})\s(.*?)\s([0-9,]+\.\d{2}(?: CR)?)" + matches = re.findall(pattern, full_text, re.DOTALL) + df = pd.DataFrame( + matches, columns=["TRANS DATE", "POST DATE", "DESCRIPTION", "AMOUNT HKD"] + ) + df["TRANS DATE"] = df["TRANS DATE"].apply(lambda x: self.add_year_to_date(x)) + df["POST DATE"] = df["POST DATE"].apply(lambda x: self.add_year_to_date(x)) + df["AMOUNT HKD"] = df["AMOUNT HKD"].apply(self.clean_amount) + df.insert(0, "Card Last 4 Digits", self.extract_card_last_4_digits()) + df["RUNNING BALANCE"] = ( + df["AMOUNT HKD"].cumsum() + self.extract_previous_balance() + ) + return df + + def clean_amount(self, amount): + if "CR" in amount: + amount = -float(amount.replace(",", "").replace("CR", "")) + else: + amount = float(amount.replace(",", "")) + return amount + + def extract_statement_date(self): + full_text = self.extract_text() + statement_date_pattern = r"STATEMENT DATE (\d{2} \w{3} \d{4})" + statement_date_match = re.search(statement_date_pattern, full_text) + if statement_date_match: + statement_date = datetime.strptime( + statement_date_match.group(1), "%d %b %Y" + ) + else: + raise ValueError("Statement date not found in the statement.") + return statement_date + + def add_year_to_date(self, date): + statement_date = self.extract_statement_date() + date_with_year = datetime.strptime( + date + " " + str(statement_date.year), "%d %b %Y" + ) + if date_with_year > statement_date: + date_with_year = datetime.strptime( + date + " " + str(statement_date.year - 1), "%d %b %Y" + ) + return date_with_year class BankSavingAccountStatementParser(StatementParser): @@ -26,9 +100,6 @@ class BankSavingAccountStatementParser(StatementParser): pass - - - class FundAccountStatementParser(StatementParser): def extract_transactions(self): # Code specific to Fund Account statements diff --git a/src/data_ingestion/pdf_sc.py b/src/data_ingestion/pdf_sc.py new file mode 100644 index 0000000..a29e5c5 --- /dev/null +++ b/src/data_ingestion/pdf_sc.py @@ -0,0 +1,46 @@ +# %% + +import tabula +import pandas as pd + +# Path to the PDF file +file_path = r"Z:\chris\projects\family_finance\personal-finance-database\data\raw\chris' statements\credit cards\sc\sc_credit_card_eStatement_202303.pdf" + +# Use tabula to read the tables from the PDF +tables = tabula.read_pdf(file_path, pages='all', lattice=True) + +print(len(tables)) +# %% +# 'tables' is a list of DataFrames, one for each table found in the PDF. +# You can access individual tables like this: +# df1 = tables[0] # This is the first table +# df2 = tables[1] # This is the second table +# df3 = tables[2] +# df4 = tables[3] +# From here, you can clean up and process the data in each DataFrame as needed. +# For example, you might need to rename columns, convert data types, handle missing values, etc. +tables = tabula.read_pdf(file_path, pages='all', stream=True) + + +for idx, table in enumerate(tables): + + print(idx) + print(table) +# %% +df = tables[4] +# %% +# Replace the "\r" characters with a unique delimiter +df.replace(to_replace='\r', value='|', regex=True, inplace=True) +# %% +# Split the cells into separate rows +# Split the cells into separate rows +df['Date\r日期'] = df['Date\r日期'].str.split('|') +df = df.explode('Date\r日期') + +# %% +df +# %% +# tables = tabula.read_pdf(file_path, pages='all', stream=True) +tables = tabula.read_pdf(file_path, pages='all', split_text=True) + +# %% diff --git a/tests/test_data_ingestion.py b/tests/test_data_ingestion.py index 377fbd6..c9c69ce 100644 --- a/tests/test_data_ingestion.py +++ b/tests/test_data_ingestion.py @@ -1,15 +1,17 @@ import pytest from pdf_parser import PdfParser + def test_pdf_parser_text_extraction(): - pdf_parser = PdfParser('path_to_test_pdf') + pdf_parser = PdfParser("path_to_test_pdf") text = pdf_parser.extract_text() assert isinstance(text, str) assert len(text) > 0 + def test_pdf_parser_table_extraction(): - pdf_parser = PdfParser('path_to_test_pdf') + pdf_parser = PdfParser("path_to_test_pdf") tables = pdf_parser.extract_table() assert isinstance(tables, list) diff --git a/tests/test_pdf_parser.py b/tests/test_pdf_parser.py new file mode 100644 index 0000000..4342e37 --- /dev/null +++ b/tests/test_pdf_parser.py @@ -0,0 +1,28 @@ +import pandas as pd +import pytest +from pdf_parser import DBSCreditCardStatementParser + + +def test_dbs_credit_card_statement_parser(): + # The path to a sample PDF file to use for testing + sample_pdf_path = "tests/sample_dbs_statement.pdf" + + # Initialize the parser + parser = DBSCreditCardStatementParser(sample_pdf_path) + + # Parse the PDF file + df = parser.parse() + + # Check the DataFrame's columns + assert list(df.columns) == [ + "Card Last 4 Digits", + "TRANS DATE", + "POST DATE", + "DESCRIPTION", + "AMOUNT HKD", + "RUNNING BALANCE", + ] + + # Check the DataFrame's number of rows + # (Replace 21 with the actual number of transactions in the sample PDF file) + assert len(df) == 21