add type hinting
PEP8 formatting
@@ -350,7 +350,7 @@ def translate(
     return trans_result
 
 
-def load_cookies(cookie_json_filename: str):
+def load_cookies(cookie_json_filename: str) -> typing.Tuple[typing.Optional[dict], typing.Optional[str]]:
     """
     加载cookie,用于以会员方式访问非游客内容
 
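The new annotation pins down the contract: callers get back a (cookies_dict, filepath) tuple in which either element may be None. The commit does not show the function body, so the following is only a minimal sketch consistent with that signature; the search locations and error handling are assumptions.

import json
import typing
from pathlib import Path


def load_cookies(cookie_json_filename: str) -> typing.Tuple[typing.Optional[dict], typing.Optional[str]]:
    """
    加载cookie,用于以会员方式访问非游客内容
    (Load cookies so non-guest content can be fetched as a member.)
    """
    # Assumed search order: working directory, then the user's home.
    for folder in (Path.cwd(), Path.home()):
        filepath = folder / cookie_json_filename
        if filepath.is_file():
            try:
                return json.loads(filepath.read_text(encoding='utf-8')), str(filepath)
            except (json.JSONDecodeError, OSError):
                return None, None
    return None, None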
@@ -563,8 +563,10 @@ class IniProxy():
         self.proxytype = proxytype
 
     def proxies(self):
-        ''' 获得代理参数,默认http代理
-        '''
+        """
+        获得代理参数,默认http代理
+        get proxy params, use http proxy for default
+        """
         if self.address:
             if self.proxytype in self.SUPPORT_PROXY_TYPE:
                 proxies = {"http": self.proxytype + "://" + self.address,
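proxies() evidently builds the mapping that requests-style clients accept; the diff shows the "http" entry ending in a comma, so an "https" entry presumably follows. A hedged usage sketch, with the address and scheme invented for the demo:

import requests

# Mapping shaped the way IniProxy.proxies() appears to build it:
# proxytype + "://" + address for each protocol. Values are examples only.
proxies = {"http": "http://127.0.0.1:8080",
           "https": "http://127.0.0.1:8080"}
resp = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10)
print(resp.status_code)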
scraper.py (76 changed lines)

@@ -1,10 +1,19 @@
+# build-in lib
 import json
 import secrets
-import config
-from lxml import etree
 from pathlib import Path
 
-from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate
+# third party lib
+from lxml import etree
+
+# project wide definitions
+import config
+from ADC_function import (translate,
+                          load_cookies,
+                          file_modification_days,
+                          delete_all_elements_in_str,
+                          delete_all_elements_in_list
+                          )
 from scrapinglib.api import search
 
 
@@ -22,11 +31,11 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
 
     # TODO 准备参数
     # - 清理 ADC_function, webcrawler
-    proxies = None
-    configProxy = conf.proxy()
-    if configProxy.enable:
-        proxies = configProxy.proxies()
+    proxies: dict = None
+    config_proxy = conf.proxy()
+    if config_proxy.enable:
+        proxies = config_proxy.proxies()
 
     javdb_sites = conf.javdb_sites().split(',')
     for i in javdb_sites:
         javdb_sites[javdb_sites.index(i)] = "javdb" + i
@@ -44,19 +53,21 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
             has_json = True
             break
         elif cdays != 9999:
-            print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
+            print(
+                f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.')
     if not has_json:
+        # get real random site from javdb_sites, because random is not really random when the seed value is known
         javdb_site = secrets.choice(javdb_sites)
         javdb_cookies = None
 
-    cacert =None
+    ca_cert = None
     if conf.cacert_file():
-        cacert = conf.cacert_file()
+        ca_cert = conf.cacert_file()
 
-    json_data = search(file_number, sources, proxies=proxies, verify=cacert,
+    json_data = search(file_number, sources, proxies=proxies, verify=ca_cert,
                        dbsite=javdb_site, dbcookies=javdb_cookies,
                        morestoryline=conf.is_storyline(),
                        specifiedSource=specified_source, specifiedUrl=specified_url)
     # Return if data not found in all sources
     if not json_data:
         print('[-]Movie Number not found!')
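The added comment carries the reasoning for using secrets.choice here: random is a seeded PRNG whose output can be replayed once the seed is known, while secrets draws from the operating system's entropy source. A small illustration:

import random
import secrets

sites = ['javdb1', 'javdb2', 'javdb3']

# random: fully reproducible once the seed is known
random.seed(42)
print(random.choice(sites))   # same value on every run

# secrets: backed by os.urandom, no user-settable seed to replay
print(secrets.choice(sites))  # unpredictable choice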
@@ -181,8 +192,8 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
     if oCC:
         cc_vars = conf.cc_convert_vars().split(",")
         ccm = conf.cc_convert_mode()
 
-        def convert_list(mapping_data,language,vars):
+        def convert_list(mapping_data, language, vars):
             total = []
             for i in vars:
                 if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")) != 0:
@@ -190,11 +201,12 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
                     total.append(i)
             return total
 
-        def convert(mapping_data,language,vars):
+        def convert(mapping_data, language, vars):
             if len(mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)) != 0:
                 return mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=vars)[0]
             else:
                 raise IndexError('keyword not found')
+
         for cc in cc_vars:
             if json_data[cc] == "" or len(json_data[cc]) == 0:
                 continue
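Both helpers lean on lxml's XPath variable binding: keyword arguments passed to xpath() populate $name inside the expression, so the looked-up value is never spliced into the query string. A self-contained sketch with an invented mapping document:

from lxml import etree

# Invented stand-in for the real keyword-mapping XML document.
mapping_data = etree.XML(
    '<mapping>'
    '<a keyword=",素人," zh_cn="素人" zh_tw="素人"/>'
    '</mapping>'
)
# $name is bound through the keyword argument rather than string
# formatting, so commas or quotes in the value cannot break the query.
hits = mapping_data.xpath('a[contains(@keyword, $name)]/@zh_cn', name=',素人,')
print(hits)  # ['素人']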
@@ -241,7 +253,7 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
     except:
         pass
 
-    naming_rule=""
+    naming_rule = ""
     for i in conf.naming_rule().split("+"):
         if i not in json_data:
             naming_rule += i.strip("'").strip('"')
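The naming rule is a '+'-separated template in which quoted parts are literals and bare parts are json_data keys. A sketch of the expansion; the else branch is assumed, since the hunk is cut off right after the literal case:

# Hypothetical rule and data, just to show how the loop expands a template.
json_data = {'number': 'ABC-123', 'title': 'Example'}
naming_rule = ""
for i in "'['+number+']'+title".split("+"):
    if i not in json_data:
        naming_rule += i.strip("'").strip('"')
    else:
        # Assumed branch: keys present in json_data expand to their values.
        naming_rule += str(json_data.get(i))
print(naming_rule)  # [ABC-123]Example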
@@ -256,17 +268,17 @@ def get_data_from_json(file_number, oCC, specified_source, specified_url):
 def special_characters_replacement(text) -> str:
     if not isinstance(text, str):
         return text
     return (text.replace('\\', '∖').  # U+2216 SET MINUS @ Basic Multilingual Plane
             replace('/', '∕').  # U+2215 DIVISION SLASH @ Basic Multilingual Plane
             replace(':', '꞉').  # U+A789 MODIFIER LETTER COLON @ Latin Extended-D
             replace('*', '∗').  # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane
             replace('?', '？').  # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane
             replace('"', '＂').  # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
             replace('<', 'ᐸ').  # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
             replace('>', 'ᐳ').  # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
             replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
             replace('&lsquo;', '‘').  # U+02018 LEFT SINGLE QUOTATION MARK
             replace('&rsquo;', '’').  # U+02019 RIGHT SINGLE QUOTATION MARK
-            replace('&hellip;','…').
+            replace('&hellip;', '…').
             replace('&amp;', '&')
             )
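Note: the scraped page had rendered the HTML entities in the last four replace() calls, making them look like no-ops; the entity arguments ('&lsquo;', '&rsquo;', '&hellip;', '&amp;') and the fullwidth '？'/'＂' are restored here from the codepoint comments. Taken together, the function swaps characters that are illegal in Windows filenames for Unicode lookalikes and decodes stray HTML entities left over from scraping. A quick check, assuming the definition above is in scope:

print(special_characters_replacement('A/B: C?'))  # A∕B꞉ C？
print(special_characters_replacement('R&amp;B'))  # R&B
print(special_characters_replacement(None))       # non-strings pass through untouched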
@@ -25,9 +25,9 @@ from .tmdb import Tmdb
 from .imdb import Imdb
 
 
-def search(number, sources: str=None, proxies=None, verify=None, type='adult',
+def search(number, sources: str = None, proxies=None, verify=None, type='adult',
            specifiedSource=None, specifiedUrl=None,
            dbcookies=None, dbsite=None, morestoryline=False):
     """ 根据`番号/电影`名搜索信息
 
     :param number: number/name depends on type
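The only change is the whitespace PEP 8 prescribes (pycodestyle E252): a default value takes spaces around '=' when the parameter is annotated, and none when it is not. In miniature:

# PEP 8 (E252): annotated default gets spaces, bare default does not.
def search_stub(sources: str = None, proxies=None):
    return sources, proxies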
@@ -51,11 +51,11 @@ def getSupportedSources(tag='adult'):
     return ','.join(sc.general_full_sources)
 
 
-class Scraping():
+class Scraping:
     """
     """
     adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321',
                           'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', 'mv91',
                           'getchu', 'gcolle'
                           ]
     adult_func_mapping = {
@@ -77,7 +77,7 @@ class Scraping():
         'javlibrary': Javlibrary().scrape,
     }
 
-    general_full_sources = ['tmdb','imdb']
+    general_full_sources = ['tmdb', 'imdb']
     general_func_mapping = {
         'tmdb': Tmdb().scrape,
         'imdb': Imdb().scrape,
@@ -200,7 +200,8 @@ class Scraping():
             sources = self.adult_full_sources
         else:
             sources = c_sources.split(',')
-        def insert(sources,source):
+
+        def insert(sources, source):
             if source in sources:
                 sources.insert(0, sources.pop(sources.index(source)))
             return sources
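insert() promotes a preferred source to the front of the probe order when it is present. The helper is self-contained, so it can be checked directly:

def insert(sources, source):
    if source in sources:
        sources.insert(0, sources.pop(sources.index(source)))
    return sources

print(insert(['javbus', 'javdb', 'airav'], 'javdb'))
# ['javdb', 'javbus', 'airav']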