From 1460e2962da56f51516cf1c03e5abc7c08ac8026 Mon Sep 17 00:00:00 2001 From: lededev Date: Thu, 6 May 2021 02:07:53 +0800 Subject: [PATCH] carib.py: use proxy config settings --- WebCrawler/carib.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) mode change 100644 => 100755 WebCrawler/carib.py diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py old mode 100644 new mode 100755 index 2e5f9ec..6896683 --- a/WebCrawler/carib.py +++ b/WebCrawler/carib.py @@ -4,20 +4,14 @@ import json from bs4 import BeautifulSoup from lxml import html import re -import urllib.request -import socket from ADC_function import * -def get_html(url): - socket.setdefaulttimeout(10) - papg = urllib.request.urlopen(url) - htm = papg.read() - htm = htm.decode("euc_jp") - return htm - def main(number: str) -> json: try: - caribhtml = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html') + caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', + return_type="content") + + caribhtml = caribbytes.decode("euc_jp") soup = BeautifulSoup(caribhtml, "html.parser") lx = html.fromstring(str(soup)) @@ -47,7 +41,7 @@ def main(number: str) -> json: 'source': 'carib.py', 'series': '', } - js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') + js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) return js def get_title(lx: html.HtmlElement) -> str: