chore(release): v0.1.0 – initial public release of TradingAgents
108
tradingagents/dataflows/googlenews_utils.py
Normal file
@@ -0,0 +1,108 @@
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import random
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
    retry_if_result,
)


def is_rate_limited(response):
    """Check if the response indicates rate limiting (status code 429)"""
    return response.status_code == 429


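# Retry via tenacity only when Google answers with HTTP 429 (rate limited):
# up to 5 attempts, exponential backoff waiting between 4 and 60 seconds.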
@retry(
    retry=(retry_if_result(is_rate_limited)),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
)
def make_request(url, headers):
    """Make a request with retry logic for rate limiting"""
    # Random delay before each request to avoid detection
    time.sleep(random.uniform(2, 6))
    response = requests.get(url, headers=headers)
    return response


def getNewsData(query, start_date, end_date):
    """
    Scrape Google News search results for a given query and date range.
    query: str - search query
    start_date: str - start date in the format yyyy-mm-dd or mm/dd/yyyy
    end_date: str - end date in the format yyyy-mm-dd or mm/dd/yyyy
    """
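    # Google's custom date range (cdr) filter expects mm/dd/yyyy, so
    # ISO yyyy-mm-dd inputs are normalized before building the URL.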
if "-" in start_date:
|
||||
start_date = datetime.strptime(start_date, "%Y-%m-%d")
|
||||
start_date = start_date.strftime("%m/%d/%Y")
|
||||
if "-" in end_date:
|
||||
end_date = datetime.strptime(end_date, "%Y-%m-%d")
|
||||
end_date = end_date.strftime("%m/%d/%Y")
|
||||
|
||||
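    # A desktop browser User-Agent; Google may block or serve different markup
    # to the default python-requests User-Agent.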
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/101.0.4951.54 Safari/537.36"
        )
    }

    news_results = []
    page = 0
    while True:
        offset = page * 10
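        # Google News search URL: tbm=nws selects the News tab, tbs=cdr:1 with
        # cd_min/cd_max restricts the date range, and start is the result
        # offset (10 results per page).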
        url = (
            f"https://www.google.com/search?q={query}"
            f"&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"
            f"&tbm=nws&start={offset}"
        )

        try:
            response = make_request(url, headers)
            soup = BeautifulSoup(response.content, "html.parser")
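            # div.SoaBEf and the field selectors below are Google's generated
            # class names; they can change without notice and break parsing.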
            results_on_page = soup.select("div.SoaBEf")

            if not results_on_page:
                break  # No more results found

            for el in results_on_page:
                try:
                    link = el.find("a")["href"]
                    title = el.select_one("div.MBeuO").get_text()
                    snippet = el.select_one(".GI74Re").get_text()
                    date = el.select_one(".LfVVr").get_text()
                    source = el.select_one(".NUnG9d span").get_text()
                    news_results.append(
                        {
                            "link": link,
                            "title": title,
                            "snippet": snippet,
                            "date": date,
                            "source": source,
                        }
                    )
                except Exception as e:
                    print(f"Error processing result: {e}")
                    # If one of the fields is not found, skip this result
                    continue

            # Update the progress bar with the current count of results scraped

            # Check for the "Next" link (pagination)
            next_link = soup.find("a", id="pnnext")
            if not next_link:
                break

            page += 1

        except Exception as e:
            print(f"Failed after multiple retries: {e}")
            break

    return news_results
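
For reference, a minimal usage sketch of the new helper (not part of the committed file; it assumes the package layout shown in the file path, and the query string and dates are illustrative):

from tradingagents.dataflows.googlenews_utils import getNewsData

# Fetch news results for a hypothetical query over a ten-day window.
articles = getNewsData("NVDA", "2024-05-01", "2024-05-10")
for item in articles[:3]:
    print(item["date"], item["source"], item["title"], item["link"])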