diff --git a/config.ini b/config.ini index eeb6c60..61dd8f5 100755 --- a/config.ini +++ b/config.ini @@ -57,7 +57,7 @@ image_naming_with_number = 0 update_check = 1 [priority] -website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle +website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,mv91,getchu,javdb,gcolle,javday,javmenu [escape] literals = \()/ diff --git a/number_parser.py b/number_parser.py index fe6af19..fedad50 100755 --- a/number_parser.py +++ b/number_parser.py @@ -59,7 +59,7 @@ def get_number(debug: bool, file_path: str) -> str: if not re.search("-|_", filename): # 去掉-CD1之后再无-的情况,例如n1012-CD1.wmv return str(re.search(r'\w+', filename[:filename.find('.')], re.A).group()) file_number = os.path.splitext(filename) - filename = re.search(r'\w+(-|_)\w+', filename, re.A) + filename = re.search(r'[\w\-_]+', filename, re.A) if filename: file_number = str(filename.group()) else: @@ -85,34 +85,7 @@ def get_number(debug: bool, file_path: str) -> str: print(f'[-]Number Parser exception: {e} [{file_path}]') return None -# modou提取number -def md(filename): - m = re.search(r'(md[a-z]{0,2}-?)(\d{2,})(-ep\d*|-\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(4)}{m.group(3) or ""}' -def mmz(filename): - m = re.search(r'(mmz-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def msd(filename): - m = re.search(r'(msd-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def mky(filename): - m = re.search(r'(mky-[a-z]{2,2}-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def yk(filename): - m = re.search(r'(yk-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - 
-def pm(filename): - m = re.search(r'(pm[a-z]?-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' - -def fsog(filename): - m = re.search(r'(fsog-?)(\d{2,})(-ep\d*)*', filename, re.I) - return f'{m.group(1).replace("-","").upper()}{m.group(2).zfill(3)}{m.group(3) or ""}' # 按javdb数据源的命名规范提取number G_TAKE_NUM_RULES = { @@ -126,13 +99,6 @@ G_TAKE_NUM_RULES = { 'heyzo': lambda x: 'HEYZO-' + re.findall(r'heyzo[^\d]*(\d{4})', x, re.I)[0], 'mdbk': lambda x: str(re.search(r'mdbk(-|_)(\d{4})', x, re.I).group()), 'mdtm': lambda x: str(re.search(r'mdtm(-|_)(\d{4})', x, re.I).group()), - r'\bmd[a-z]{0,2}-\d{2,}': md, - r'\bmmz-\d{2,}':mmz, - r'\bmsd-\d{2,}':msd, - r'\bmky-[a-z]{2,2}-\d{2,}':mky, - r'\byk-\d{2,3}': yk, - r'\bpm[a-z]?-?\d{2,}':pm, - r'\bfsog-?\d{2,}':fsog } diff --git a/py_to_exe.ps1 b/py_to_exe.ps1 index 399963b..c90068d 100644 --- a/py_to_exe.ps1 +++ b/py_to_exe.ps1 @@ -1,25 +1,25 @@ -# If you can't run this script, please execute the following command in PowerShell. -# Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force - -$CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1) -$OPENCC_PATH=$(python -c 'import opencc as _; print(_.__path__[0])' | select -Last 1) -$FACE_RECOGNITION_MODELS=$(python -c 'import face_recognition_models as _; print(_.__path__[0])' | select -Last 1) - -mkdir build -mkdir __pycache__ - -pyinstaller --onefile Movie_Data_Capture.py ` - --hidden-import "ImageProcessing.cnn" ` - --python-option u ` - --add-data "$FACE_RECOGNITION_MODELS;face_recognition_models" ` - --add-data "$CLOUDSCRAPER_PATH;cloudscraper" ` - --add-data "$OPENCC_PATH;opencc" ` - --add-data "Img;Img" ` - --add-data "config.ini;." 
` - -rmdir -Recurse -Force build -rmdir -Recurse -Force __pycache__ -rmdir -Recurse -Force Movie_Data_Capture.spec - -echo "[Make]Finish" -pause +# If you can't run this script, please execute the following command in PowerShell. +# Set-ExecutionPolicy RemoteSigned -Scope CurrentUser -Force + +# bugfix:set submodules find path +$Env:PYTHONPATH=$pwd.path +$PYTHONPATH=$pwd.path +mkdir build +mkdir __pycache__ + +pyinstaller --collect-submodules "scrapinglib" ` + --collect-submodules "ImageProcessing" ` + --collect-data "face_recognition_models" ` + --collect-data "cloudscraper" ` + --collect-data "opencc" ` + --add-data "Img;Img" ` + --add-data "config.ini;." ` + --onefile Movie_Data_Capture.py + + +rmdir -Recurse -Force build +rmdir -Recurse -Force __pycache__ +rmdir -Recurse -Force Movie_Data_Capture.spec + +echo "[Make]Finish" +pause diff --git a/scrapinglib/__init__.py b/scrapinglib/__init__.py index ee27a25..e2144f5 100644 --- a/scrapinglib/__init__.py +++ b/scrapinglib/__init__.py @@ -1,3 +1,2 @@ # -*- coding: utf-8 -*- - -from .api import search, getSupportedSources +from .api import search, getSupportedSources \ No newline at end of file diff --git a/scrapinglib/api.py b/scrapinglib/api.py index 2bb8585..1c2e8e8 100644 --- a/scrapinglib/api.py +++ b/scrapinglib/api.py @@ -2,30 +2,9 @@ import re import json - +from .parser import Parser import config -from .airav import Airav -from .carib import Carib -from .dlsite import Dlsite -from .fanza import Fanza -from .gcolle import Gcolle -from .getchu import Getchu -from .jav321 import Jav321 -from .javdb import Javdb -from .fc2 import Fc2 -from .madou import Madou -from .mgstage import Mgstage -from .javbus import Javbus -from .xcity import Xcity -from .avsox import Avsox -from .javlibrary import Javlibrary -from .javday import Javday -from .pissplay import Pissplay -from .javmenu import Javmenu - -from .tmdb import Tmdb -from .imdb import Imdb - +import importlib def search(number, sources: str = None, **kwargs): 
""" 根据`番号/电影`名搜索信息 @@ -53,35 +32,11 @@ class Scraping: """ """ adult_full_sources = ['javlibrary', 'javdb', 'javbus', 'airav', 'fanza', 'xcity', 'jav321', - 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', - 'getchu', 'gcolle','javday','pissplay','javmenu' + 'mgstage', 'fc2', 'avsox', 'dlsite', 'carib', 'madou', + 'getchu', 'gcolle', 'javday', 'pissplay', 'javmenu' ] - adult_func_mapping = { - 'avsox': Avsox().scrape, - 'javbus': Javbus().scrape, - 'xcity': Xcity().scrape, - 'mgstage': Mgstage().scrape, - 'madou': Madou().scrape, - 'fc2': Fc2().scrape, - 'dlsite': Dlsite().scrape, - 'jav321': Jav321().scrape, - 'fanza': Fanza().scrape, - 'airav': Airav().scrape, - 'carib': Carib().scrape, - 'gcolle': Gcolle().scrape, - 'javdb': Javdb().scrape, - 'getchu': Getchu().scrape, - 'javlibrary': Javlibrary().scrape, - 'javday': Javday().scrape, - 'pissplay': Pissplay().scrape, - 'javmenu': Javmenu().scrape - } general_full_sources = ['tmdb', 'imdb'] - general_func_mapping = { - 'tmdb': Tmdb().scrape, - 'imdb': Imdb().scrape, - } debug = False @@ -126,13 +81,16 @@ class Scraping: if self.debug: print('[+]select', source) try: - data = self.general_func_mapping[source](name, self) + module = importlib.import_module('.'+source,'scrapinglib') + parser_type = getattr(module, source.capitalize()) + parser:Parser = parser_type() + data = parser.scrape(name,self) if data == 404: continue json_data = json.loads(data) except Exception as e: - # print('[!] 出错啦') - # print(e) + print('[!] 
出错啦') + print(e) pass # if any service return a valid return, break if self.get_data_state(json_data): @@ -162,13 +120,16 @@ class Scraping: if self.debug: print('[+]select', source) try: - data = self.adult_func_mapping[source](number, self) + module = importlib.import_module('.'+source,'scrapinglib') + parser_type = getattr(module, source.capitalize()) + parser:Parser = parser_type() + data = parser.scrape(number,self) if data == 404: continue json_data = json.loads(data) except Exception as e: - # print('[!] 出错啦') - # print(e) + print('[!] 出错啦') + print(e) pass # json_data = self.func_mapping[source](number, self) # if any service return a valid return, break @@ -216,7 +177,7 @@ class Scraping: # check sources in func_mapping todel = [] for s in sources: - if not s in self.general_func_mapping: + if not s in self.general_full_sources: print('[!] Source Not Exist : ' + s) todel.append(s) for d in todel: @@ -235,7 +196,7 @@ class Scraping: sources.insert(0, sources.pop(sources.index(source))) return sources - if len(sources) <= len(self.adult_func_mapping): + if len(sources) <= len(self.adult_full_sources): # if the input file name matches certain rules, # move some web service to the beginning of the list lo_file_number = file_number.lower() @@ -271,7 +232,7 @@ class Scraping: # check sources in func_mapping todel = [] for s in sources: - if not s in self.adult_func_mapping and config.getInstance().debug(): + if not s in self.adult_full_sources and config.getInstance().debug(): print('[!] 
Source Not Exist : ' + s) todel.append(s) for d in todel: diff --git a/scrapinglib/fc2.py b/scrapinglib/fc2.py index 21629ea..7f11851 100644 --- a/scrapinglib/fc2.py +++ b/scrapinglib/fc2.py @@ -22,6 +22,7 @@ class Fc2(Parser): def extraInit(self): self.imagecut = 0 + self.allow_number_change = True def search(self, number): self.number = number.lower().replace('fc2-ppv-', '').replace('fc2-', '') diff --git a/scrapinglib/javday.py b/scrapinglib/javday.py index a462ba1..cb8edf7 100644 --- a/scrapinglib/javday.py +++ b/scrapinglib/javday.py @@ -39,3 +39,8 @@ class Javday(Parser): # 删除番号和网站名 result = title.replace(self.number,"").replace("- JAVDAY.TV","").strip() return result + + def getTags(self, htmltree) -> list: + tags = super().getTags(htmltree) + return [tag for tag in tags if 'JAVDAY.TV' not in tag] + \ No newline at end of file diff --git a/scrapinglib/madou.py b/scrapinglib/madou.py index 6e288b6..91742ff 100644 --- a/scrapinglib/madou.py +++ b/scrapinglib/madou.py @@ -6,6 +6,28 @@ from urllib.parse import urlparse, unquote from .parser import Parser +NUM_RULES3=[ + r'(mmz)-?(\d{2,})(-ep\d*|-\d*)?.*', + r'(msd)-?(\d{2,})(-ep\d*|-\d*)?.*', + r'(yk)-?(\d{2,})(-ep\d*|-\d*)?.*', + r'(pm)-?(\d{2,})(-ep\d*|-\d*)?.*', + r'(mky-[a-z]{2,2})-?(\d{2,})(-ep\d*|-\d*)?.*', +] + +# modou提取number +def change_number(number): + number = number.lower().strip() + m = re.search(r'(md[a-z]{0,2})-?(\d{2,})(-ep\d*|-\d*)?.*', number, re.I) + if m: + return f'{m.group(1)}{m.group(2).zfill(4)}{m.group(3) or ""}' + for rules in NUM_RULES3: + m = re.search(rules, number, re.I) + if m: + return f'{m.group(1)}{m.group(2).zfill(3)}{m.group(3) or ""}' + return number + + + class Madou(Parser): source = 'madou' @@ -14,12 +36,15 @@ class Madou(Parser): expr_studio = '//a[@rel="category tag"]/text()' expr_tags = '/html/head/meta[@name="keywords"]/@content' + + def extraInit(self): - self.imagecut = 0 + self.imagecut = 4 self.uncensored = True + self.allow_number_change = True def 
search(self, number): - self.number = number.lower().strip() + self.number = change_number(number) if self.specifiedUrl: self.detailurl = self.specifiedUrl else: @@ -65,5 +90,5 @@ class Madou(Parser): def getTags(self, htmltree): studio = self.getStudio(htmltree) - x = super().getTags(htmltree) - return [i.strip() for i in x if len(i.strip()) and studio not in i and '麻豆' not in i] + tags = super().getTags(htmltree) + return [tag for tag in tags if studio not in tag and '麻豆' not in tag] diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py index dbbf8e4..d3727d0 100644 --- a/scrapinglib/parser.py +++ b/scrapinglib/parser.py @@ -210,6 +210,13 @@ class Parser: def getTags(self, htmltree) -> list: alls = self.getTreeAll(htmltree, self.expr_tags) + tags = [] + for t in alls: + for tag in t.strip().split(','): + tag = tag.strip() + if tag: + tags.append(tag) + return tags return [ x.strip() for x in alls if x.strip()] def getStudio(self, htmltree):