chore(release): v0.1.0 – initial public release of TradingAgents
108
tradingagents/dataflows/googlenews_utils.py
Normal file
@@ -0,0 +1,108 @@
import json
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import random
from tenacity import (
    retry,
    stop_after_attempt,
    wait_exponential,
    retry_if_exception_type,
    retry_if_result,
)


def is_rate_limited(response):
    """Check if the response indicates rate limiting (status code 429)"""
    return response.status_code == 429


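# Retry via tenacity only when Google answers with HTTP 429 (rate limited):
# up to 5 attempts, exponential backoff waiting between 4 and 60 seconds.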
@retry(
    retry=(retry_if_result(is_rate_limited)),
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
)
def make_request(url, headers):
    """Make a request with retry logic for rate limiting"""
    # Random delay before each request to avoid detection
    time.sleep(random.uniform(2, 6))
    response = requests.get(url, headers=headers)
    return response


def getNewsData(query, start_date, end_date):
    """
    Scrape Google News search results for a given query and date range.
    query: str - search query
    start_date: str - start date in the format yyyy-mm-dd or mm/dd/yyyy
    end_date: str - end date in the format yyyy-mm-dd or mm/dd/yyyy
    """
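    # Google's custom date range (cdr) filter expects mm/dd/yyyy, so
    # ISO yyyy-mm-dd inputs are normalized before building the URL.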
if "-" in start_date:
|
||||
start_date = datetime.strptime(start_date, "%Y-%m-%d")
|
||||
start_date = start_date.strftime("%m/%d/%Y")
|
||||
if "-" in end_date:
|
||||
end_date = datetime.strptime(end_date, "%Y-%m-%d")
|
||||
end_date = end_date.strftime("%m/%d/%Y")
|
||||
|
||||
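    # A desktop browser User-Agent; Google may block or serve different markup
    # to the default python-requests User-Agent.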
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/101.0.4951.54 Safari/537.36"
        )
    }

    news_results = []
    page = 0
    while True:
        offset = page * 10
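        # Google News search URL: tbm=nws selects the News tab, tbs=cdr:1 with
        # cd_min/cd_max restricts the date range, and start is the result
        # offset (10 results per page).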
        url = (
            f"https://www.google.com/search?q={query}"
            f"&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}"
            f"&tbm=nws&start={offset}"
        )

        try:
            response = make_request(url, headers)
            soup = BeautifulSoup(response.content, "html.parser")
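            # div.SoaBEf and the field selectors below are Google's generated
            # class names; they can change without notice and break parsing.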
            results_on_page = soup.select("div.SoaBEf")

            if not results_on_page:
                break  # No more results found

            for el in results_on_page:
                try:
                    link = el.find("a")["href"]
                    title = el.select_one("div.MBeuO").get_text()
                    snippet = el.select_one(".GI74Re").get_text()
                    date = el.select_one(".LfVVr").get_text()
                    source = el.select_one(".NUnG9d span").get_text()
                    news_results.append(
                        {
                            "link": link,
                            "title": title,
                            "snippet": snippet,
                            "date": date,
                            "source": source,
                        }
                    )
                except Exception as e:
                    print(f"Error processing result: {e}")
                    # If one of the fields is not found, skip this result
                    continue

            # Update the progress bar with the current count of results scraped

            # Check for the "Next" link (pagination)
            next_link = soup.find("a", id="pnnext")
            if not next_link:
                break

            page += 1

        except Exception as e:
            print(f"Failed after multiple retries: {e}")
            break

    return news_results
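
For reference, a minimal usage sketch of the new helper (not part of the committed file; it assumes the package layout shown in the file path, and the query string and dates are illustrative):

from tradingagents.dataflows.googlenews_utils import getNewsData

# Fetch news results for a hypothetical query over a ten-day window.
articles = getNewsData("NVDA", "2024-05-01", "2024-05-10")
for item in articles[:3]:
    print(item["date"], item["source"], item["title"], item["link"])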