Merge pull request #478 from lededev/carib-enable-proxy

carib.py: use proxy config settings
Committed by Yoshiko2 via GitHub on 2021-05-08 00:42:42 +08:00

WebCrawler/carib.py · 16 lines changed · Normal file → Executable file

@@ -4,20 +4,14 @@ import json
 from bs4 import BeautifulSoup
 from lxml import html
 import re
-import urllib.request
-import socket
 from ADC_function import *
-def get_html(url):
-    socket.setdefaulttimeout(10)
-    papg = urllib.request.urlopen(url)
-    htm = papg.read()
-    htm = htm.decode("euc_jp")
-    return htm
 def main(number: str) -> json:
     try:
-        caribhtml = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html')
+        caribbytes = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+                              return_type="content")
+        caribhtml = caribbytes.decode("euc_jp")
         soup = BeautifulSoup(caribhtml, "html.parser")
         lx = html.fromstring(str(soup))
@@ -47,7 +41,7 @@ def main(number: str) -> json:
         'source': 'carib.py',
         'series': '',
     }
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
+    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
     return js
 def get_title(lx: html.HtmlElement) -> str:
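
For orientation, below is a minimal sketch of the kind of proxy-aware fetch helper the new call relies on. It is an illustration only: the real get_html in ADC_function reads the proxy settings from the project's config, and its exact signature, config keys, and proxy values are assumptions here, not the project's code. Only the requests library's own API is used.

# Sketch (assumptions noted above): a proxy-aware fetch helper in the spirit
# of ADC_function.get_html; the real helper's signature may differ.
import requests

def get_html(url: str, return_type: str = "text", proxies: dict = None, timeout: int = 10):
    """Fetch url, optionally through a proxy; return decoded text or raw bytes."""
    # Assumed proxy mapping format, e.g.
    # {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
    resp = requests.get(url, proxies=proxies, timeout=timeout)
    if return_type == "content":
        return resp.content   # raw bytes; the caller chooses the encoding
    return resp.text          # text decoded by requests' charset detection

# Usage mirroring the new carib.py code: request raw bytes, then decode them
# as EUC-JP explicitly, because Caribbeancom pages are served in euc_jp.
number = "000000-000"   # placeholder id, for illustration only
caribbytes = get_html('https://www.caribbeancom.com/moviepages/' + number + '/index.html',
                      return_type="content")
caribhtml = caribbytes.decode("euc_jp")

Routing the request through the shared helper is what lets the project's proxy configuration take effect for carib.py, and asking for return_type="content" keeps the EUC-JP decoding explicit in the crawler instead of relying on automatic charset detection.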