diff --git a/ADC_function.py b/ADC_function.py
index ad5871f..4fcbec1 100644
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -350,7 +350,7 @@ def translate(
     return trans_result
 
 
-def load_cookies(cookie_json_filename: str):
+def load_cookies(cookie_json_filename: str) -> typing.Tuple[typing.Optional[dict], typing.Optional[str]]:
     """
     加载cookie,用于以会员方式访问非游客内容
 
diff --git a/config.py b/config.py
index 745b33b..30efb81 100644
--- a/config.py
+++ b/config.py
@@ -563,8 +563,10 @@ class IniProxy():
         self.proxytype = proxytype
 
     def proxies(self):
-        ''' 获得代理参数,默认http代理
-        '''
+        """
+        获得代理参数,默认http代理
+        get proxy params, use http proxy for default
+        """
         if self.address:
             if self.proxytype in self.SUPPORT_PROXY_TYPE:
                 proxies = {"http": self.proxytype + "://" + self.address,
diff --git a/scraper.py b/scraper.py
index ab95e3f..b27245b 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,10 +1,19 @@
+# built-in lib
 import json
 import secrets
-import config
-from lxml import etree
 from pathlib import Path
-from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate
 
+# third party lib
+from lxml import etree
+
+# project wide definitions
+import config
+from ADC_function import (translate,
+                          load_cookies,
+                          file_modification_days,
+                          delete_all_elements_in_str,
+                          delete_all_elements_in_list
+                          )
 from scrapinglib.api import search
 
 
@@ -22,11 +31,11 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
 
     # TODO 准备参数
     # - 清理 ADC_function, webcrawler
-    proxies = None
-    configProxy = conf.proxy()
-    if configProxy.enable:
-        proxies = configProxy.proxies()
-
+    proxies: dict = None
+    config_proxy = conf.proxy()
+    if config_proxy.enable:
+        proxies = config_proxy.proxies()
+
     javdb_sites = conf.javdb_sites().split(',')
     for i in javdb_sites:
         javdb_sites[javdb_sites.index(i)] = "javdb" + i
@@ -44,19 +53,21 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
                 has_json = True
                 break
             elif cdays != 9999:
-                print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
+                print(
+                    f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
     if not has_json:
+        # get real random site from javdb_sites, because random is not really random when the seed value is known
        javdb_site = secrets.choice(javdb_sites)
         javdb_cookies = None
 
-    cacert =None
+    ca_cert = None
     if conf.cacert_file():
-        cacert = conf.cacert_file()
+        ca_cert = conf.cacert_file()
 
-    json_data = search(file_number, sources, proxies=proxies, verify=cacert,
-                       dbsite=javdb_site, dbcookies=javdb_cookies,
-                       morestoryline=conf.is_storyline(),
-                       specifiedSource=specified_source, specifiedUrl=specified_url)
+    json_data = search(file_number, sources, proxies=proxies, verify=ca_cert,
+                       dbsite=javdb_site, dbcookies=javdb_cookies,
+                       morestoryline=conf.is_storyline(),
+                       specifiedSource=specified_source, specifiedUrl=specified_url)
     # Return if data not found in all sources
     if not json_data:
         print('[-]Movie Number not found!')
@@ -181,8 +192,8 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
     if oCC:
         cc_vars = conf.cc_convert_vars().split(",")
         ccm = conf.cc_convert_mode()
-
-        def convert_list(mapping_data,language,vars):
+
+        def convert_list(mapping_data, language, vars):
             total = []
             for i in vars:
                 if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")) != 0:
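Note: convert_list() above leans on lxml's XPath variable binding — $name is bound through a keyword argument of .xpath(). A minimal standalone sketch of that lookup; the <a keyword="..."/> element shape and the attribute names are inferred from the expression, not shown in this patch:

    from lxml import etree

    # one mapping entry; @keyword holds a comma-delimited list of tags
    mapping_data = etree.fromstring(
        '<c><a keyword=",tag1,tag2," zh_cn="值" zh_tw="值"/></c>')

    # $name is bound via a keyword argument, exactly as in convert_list()
    print(mapping_data.xpath('a[contains(@keyword, $name)]/@zh_cn',
                             name=',tag1,'))  # ['值']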
@@ -190,11 +201,12 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
                     total.append(i)
             return total
 
-        def convert(mapping_data,language,vars):
+        def convert(mapping_data, language, vars):
             if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
                 return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
             else:
                 raise IndexError('keyword not found')
+
         for cc in cc_vars:
             if json_data[cc] == "" or len(json_data[cc]) == 0:
                 continue
@@ -241,7 +253,7 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
         except:
             pass
 
-    naming_rule=""
+    naming_rule = ""
     for i in conf.naming_rule().split("+"):
         if i not in json_data:
             naming_rule += i.strip("'").strip('"')
@@ -256,17 +268,17 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
 def special_characters_replacement(text) -> str:
     if not isinstance(text, str):
         return text
-    return (text.replace('\\', '∖').  # U+2216 SET MINUS @ Basic Multilingual Plane
-            replace('/', '∕').  # U+2215 DIVISION SLASH @ Basic Multilingual Plane
-            replace(':', '꞉').  # U+A789 MODIFIER LETTER COLON @ Latin Extended-D
-            replace('*', '∗').  # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane
-            replace('?', '？').  # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane
-            replace('"', '＂').  # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
-            replace('<', 'ᐸ').  # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
-            replace('>', 'ᐳ').  # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
-            replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
-            replace('&lsquo;', '‘').  # U+02018 LEFT SINGLE QUOTATION MARK
-            replace('&rsquo;', '’').  # U+02019 RIGHT SINGLE QUOTATION MARK
-            replace('&hellip;','…').
-            replace('&amp;', '&')
-            )
+    return (text.replace('\\', '∖').  # U+2216 SET MINUS @ Basic Multilingual Plane
+            replace('/', '∕').  # U+2215 DIVISION SLASH @ Basic Multilingual Plane
+            replace(':', '꞉').  # U+A789 MODIFIER LETTER COLON @ Latin Extended-D
+            replace('*', '∗').  # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane
+            replace('?', '？').  # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane
+            replace('"', '＂').  # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
+            replace('<', 'ᐸ').  # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
+            replace('>', 'ᐳ').  # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
+            replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+            replace('&lsquo;', '‘').  # U+02018 LEFT SINGLE QUOTATION MARK
+            replace('&rsquo;', '’').  # U+02019 RIGHT SINGLE QUOTATION MARK
+            replace('&hellip;', '…').
+            replace('&amp;', '&')
+            )
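Note: the net effect of special_characters_replacement() on a filename, assuming the patched scraper.py imports cleanly in your environment (importing it pulls in config and ADC_function):

    >>> from scraper import special_characters_replacement
    >>> special_characters_replacement('A/B: "what?" &amp; more')
    'A∕B꞉ ＂what？＂ & more'

Filesystem-reserved characters become Unicode lookalikes, and leftover HTML entities from scraped pages are decoded.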
diff --git a/scrapinglib/api.py b/scrapinglib/api.py
index 69a59c9..728e935 100644
--- a/scrapinglib/api.py
+++ b/scrapinglib/api.py
@@ -25,9 +25,9 @@ from .tmdb import Tmdb
 from .imdb import Imdb
 
 
-def search(number, sources: str=None, proxies=None, verify=None, type='adult',
-           specifiedSource=None, specifiedUrl=None,
-           dbcookies=None, dbsite=None, morestoryline=False):
+def search(number, sources: str = None, proxies=None, verify=None, type='adult',
+           specifiedSource=None, specifiedUrl=None,
+           dbcookies=None, dbsite=None, morestoryline=False):
     """ 根据`番号/电影`名搜索信息
 
     :param number: number/name depends on type
@@ -51,11 +51,11 @@ def getSupportedSources(tag='adult'):
         return ','.join(sc.general_full_sources)
 
 
-class Scraping():
+class Scraping:
     """
     """
-    adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
-                          'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91',
+    adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
+                          'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91',
                           'getchu', 'gcolle'
                           ]
     adult_func_mapping = {
@@ -77,7 +77,7 @@ class Scraping():
         'javlibrary': Javlibrary().scrape,
     }
 
-    general_full_sources = ['tmdb','imdb']
+    general_full_sources = ['tmdb', 'imdb']
     general_func_mapping = {
         'tmdb': Tmdb().scrape,
         'imdb': Imdb().scrape,
@@ -200,7 +200,8 @@ class Scraping():
             sources = self.adult_full_sources
         else:
             sources = c_sources.split(',')
-        def insert(sources,source):
+
+        def insert(sources, source):
             if source in sources:
                 sources.insert(0, sources.pop(sources.index(source)))
             return sources
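Note: the re-spaced insert() helper only promotes a preferred source to the front of the scan order. Pulled out of the hunk above as a standalone, runnable snippet:

    def insert(sources, source):
        if source in sources:
            sources.insert(0, sources.pop(sources.index(source)))
        return sources

    print(insert(['javbus', 'javdb', 'fc2'], 'javdb'))   # ['javdb', 'javbus', 'fc2']
    print(insert(['javbus', 'fc2'], 'mgstage'))          # not present: ['javbus', 'fc2']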