Financial-Data-Scraper/Finance_data_scaper.py at main · jcaperella29/Financial-Data-Scraper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os
import time

# Paths to GeckoDriver and the Firefox binary.
# The defaults are the original Windows locations; set GECKODRIVER_PATH and/or
# FIREFOX_BINARY_PATH in the environment to run on another machine without
# editing this file.
geckodriver_path = os.environ.get(
    "GECKODRIVER_PATH",
    r"C:\Users\ccape\Downloads\geckodriver-v0.35.0-win32\geckodriver.exe",
)
firefox_binary_path = os.environ.get(
    "FIREFOX_BINARY_PATH",
    r"C:\Program Files\Mozilla Firefox\firefox.exe",
)

# Define the output directory (created up front so later CSV writes have a
# directory to land in).
output_dir = "financial_data"
os.makedirs(output_dir, exist_ok=True)

# Configure Firefox Options
options = Options()
if os.path.isfile(firefox_binary_path):
    options.binary_location = firefox_binary_path
else:
    # Leave binary_location unset rather than pointing at a missing file, so
    # the driver can fall back to its own Firefox lookup.
    print(f"⚠️ Firefox binary not found at '{firefox_binary_path}'; using the default Firefox lookup.")

# Initialize WebDriver for Firefox
if os.path.isfile(geckodriver_path):
    service = Service(geckodriver_path)
else:
    # No usable explicit driver path: let Selenium resolve geckodriver itself
    # (PATH lookup or Selenium Manager, depending on the installed version).
    print(f"⚠️ GeckoDriver not found at '{geckodriver_path}'; letting Selenium locate the driver.")
    service = Service()
driver = webdriver.Firefox(service=service, options=options)

# Define the website and tabs to scrape.
# `tabs` maps a human-readable tab name to the XPath of the link that opens it;
# the tab name is also used to derive the CSV file name.
url = "https://stockanalysis.com/stocks/gm/financials/"
tabs = {
    # Skipping click for Income Statement as it's the default open tab
    "Balance Sheet": "//a[contains(text(), 'Balance Sheet')]",
    "Cash Flow": "//a[contains(text(), 'Cash Flow')]",
    "Ratios": "//a[contains(text(), 'Ratios')]"
}

def _save_tab_table(browser, tab_name, out_dir):
    """Scrape the currently displayed financial table and write it to CSV.

    Waits up to 10 seconds for the ``data-test='financials'`` table, reads the
    text of every ``th``/``td`` cell row by row, uses the first row as the
    column labels, and saves the result as
    ``<out_dir>/<tab_name lower-cased, spaces -> underscores>.csv``.

    This is deliberately best-effort: any failure (table never appears, empty
    table, mismatched column counts, write error) is reported on stdout and
    swallowed so the remaining tabs can still be processed.

    Args:
        browser: The Selenium WebDriver showing the tab to scrape.
        tab_name: Human-readable tab name, used in messages and the file name.
        out_dir: Directory the CSV file is written to.
    """
    try:
        table = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.XPATH, "//table[@data-test='financials']"))
        )
        print(f"✅ Table found for {tab_name}")

        # Extract the text of every header/data cell, one list per table row
        table_data = [
            [cell.text for cell in row.find_elements(By.XPATH, ".//th | .//td")]
            for row in table.find_elements(By.XPATH, ".//tr")
        ]
        if not table_data:
            # Without this guard the header lookup fails with an opaque
            # "list index out of range".
            raise ValueError("the financial table contains no rows")

        # Convert to DataFrame (first row is the header)
        header, *body = table_data
        df = pd.DataFrame(body, columns=header)
        print(f"📊 Data extracted for {tab_name}:\n", df.head())

        # Save to CSV
        output_file = os.path.join(out_dir, f"{tab_name.replace(' ', '_').lower()}.csv")
        df.to_csv(output_file, index=False)
        print(f"💾 Data saved to {output_file}\n")

    except Exception as e:
        print(f"❌ Failed to extract table for {tab_name}. Error: {e}")


try:
    # 🌐 Open the Website
    print("🌐 Opening the website...")
    driver.get(url)
    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    print("✅ Page loaded successfully.")

    # 📸 Capture Debugging Screenshot
    driver.save_screenshot("page_debug.png")
    print("📸 Saved screenshot as 'page_debug.png'")

    # Handle the Income Statement directly (default tab, so no click needed)
    print("📄 Processing tab: Income Statement (default tab)...")
    _save_tab_table(driver, "Income Statement", output_dir)

    # Loop through remaining tabs and extract the table data
    for tab_name, tab_xpath in tabs.items():
        print(f"📄 Navigating to tab: {tab_name}...")

        # Reset per iteration: the JavaScript fallback below must never reuse
        # the element found for a previous tab, or it would click the wrong
        # tab and save that tab's data under this tab's file name.
        tab_element = None

        # Try clicking the tab with different methods
        try:
            # 🔄 Scroll into view
            tab_element = WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.XPATH, tab_xpath))
            )
            driver.execute_script(
                "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
                tab_element,
            )
            time.sleep(1)  # Small delay to allow rendering

            # 🖱️ Click using Selenium
            WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, tab_xpath))).click()
            print(f"✅ Clicked on {tab_name}")

        except Exception as click_error:
            if tab_element is None:
                # The tab link itself was never found, so there is nothing for
                # a JavaScript click to act on.
                print(f"❌ Could not locate the tab link for {tab_name}. Skipping tab. Error: {click_error}")
                continue

            print(f"⚠️ Regular click failed for {tab_name}. Trying JavaScript click...")
            try:
                driver.execute_script("arguments[0].click();", tab_element)
                print(f"✅ JavaScript click successful for {tab_name}")
            except Exception as js_error:
                print(f"❌ JavaScript click failed for {tab_name}. Skipping tab. Error: {js_error}")
                continue  # Skip to the next tab

        time.sleep(2)  # Allow time for the table to load

        # Locate the financial table, extract it and save it
        _save_tab_table(driver, tab_name, output_dir)

except Exception as e:
    print(f"❌ An error occurred: {e}")

finally:
    # Always release the browser, even when scraping failed part-way through
    driver.quit()
    print("🚪 Browser closed.")