From 78619f5909ccb41569906d0b17a774f63c395eb1 Mon Sep 17 00:00:00 2001
From: yoshiko2
Date: Sat, 8 Jul 2023 04:51:30 +0800
Subject: [PATCH 1/8] Set lower pillow version

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index da1c7e3..e481111 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ numpy
 face-recognition-models
 lxml
 beautifulsoup4
-pillow
+pillow==9.5.0
 cloudscraper
 pysocks==1.7.1
 urllib3==1.25.11

From 47a271f938e2d341ce055173568f649ee99b4b3e Mon Sep 17 00:00:00 2001
From: yoshiko2
Date: Sat, 8 Jul 2023 04:52:23 +0800
Subject: [PATCH 2/8] Update sites

---
 scrapinglib/api.py    |  7 ++++---
 scrapinglib/dlsite.py |  5 +++++
 scrapinglib/getchu.py | 15 +++++++++++++--
 scrapinglib/parser.py |  1 -
 4 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/scrapinglib/api.py b/scrapinglib/api.py
index e1b3c09..1889206 100644
--- a/scrapinglib/api.py
+++ b/scrapinglib/api.py
@@ -221,8 +221,9 @@ class Scraping:
             sources = ["pcolle"]
         elif "fc2" in lo_file_number:
             if "fc2" in sources:
-                sources = insert(sources, "msin")
-                sources = insert(sources, "fc2")
+                sources = ["msin", "fc2"]
+                # sources = insert(sources, "msin")
+                # sources = insert(sources, "fc2")
         elif "mgstage" in sources and \
             (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
             sources = insert(sources, "mgstage")
@@ -245,7 +246,7 @@ class Scraping:
                 todel.append(s)
         for d in todel:
             if config.getInstance().debug():
-                print('[!] Remove Source : ' + s)
+                print('[!] Remove Source : ' + d)
             sources.remove(d)
         return sources
 
diff --git a/scrapinglib/dlsite.py b/scrapinglib/dlsite.py
index d307f02..5c6f003 100644
--- a/scrapinglib/dlsite.py
+++ b/scrapinglib/dlsite.py
@@ -97,3 +97,8 @@ class Dlsite(Parser):
         except:
             result = ''
         return result
+
+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("DLsite")
+        return tags
diff --git a/scrapinglib/getchu.py b/scrapinglib/getchu.py
index 5c1649d..d5b67ef 100644
--- a/scrapinglib/getchu.py
+++ b/scrapinglib/getchu.py
@@ -95,9 +95,15 @@ class wwwGetchu(Parser):
     def extradict(self, dic: dict):
         """ 额外新增的 headers
         """
-        dic['headers'] = {'referer': self.detailurl}
+        dic['headers'] = {'referer': self.detailurl}
         return dic
 
+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("Getchu")
+        return tags
+
+
 class dlGetchu(wwwGetchu):
     """ 二者基本一致
     headers extrafanart 略有区别
@@ -140,7 +146,7 @@ class dlGetchu(wwwGetchu):
 
     def extradict(self, dic: dict):
         return dic
-    
+
     def getExtrafanart(self, htmltree):
         arts = self.getTreeAll(htmltree, self.expr_extrafanart)
         extrafanart = []
@@ -148,3 +154,8 @@ class dlGetchu(wwwGetchu):
             i = "https://dl.getchu.com" + i
             extrafanart.append(i)
         return extrafanart
+
+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("Getchu")
+        return tags
diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py
index b0cdfb7..05a7994 100644
--- a/scrapinglib/parser.py
+++ b/scrapinglib/parser.py
@@ -219,7 +219,6 @@ class Parser:
                 if tag:
                     tags.append(tag)
             return tags
-        return [ x.strip() for x in alls if x.strip()]
 
     def getStudio(self, htmltree):
         return self.getTreeElementbyExprs(htmltree, self.expr_studio, self.expr_studio2)

From 43e9d7727e60fbd3292a673f737fa312da0f58c3 Mon Sep 17 00:00:00 2001
From: yoshiko2
Date: Sat, 8 Jul 2023 04:55:31 +0800
Subject: [PATCH 3/8] Update 6.6.6

---
 Movie_Data_Capture.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Movie_Data_Capture.py b/Movie_Data_Capture.py
index 99ddf6c..a96657a 100644
--- a/Movie_Data_Capture.py
+++ b/Movie_Data_Capture.py
@@ -681,7 +681,7 @@ def period(delta, pattern):
 
 
 if __name__ == '__main__':
-    version = '6.6.5'
+    version = '6.6.6'
     urllib3.disable_warnings()  # Ignore http proxy warning
     app_start = time.time()
 

From 9b5af4beddc141d37f7fe89711b97f9ddc673f62 Mon Sep 17 00:00:00 2001
From: yoshiko2
Date: Sat, 8 Jul 2023 04:57:57 +0800
Subject: [PATCH 4/8] Update 6.6.6

---
 config.ini | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config.ini b/config.ini
index 2afe5cf..dfb0375 100755
--- a/config.ini
+++ b/config.ini
@@ -58,7 +58,7 @@ image_naming_with_number = 0
 update_check = 1
 
 [priority]
-website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,getchu,javdb,gcolle,javday,javmenu,pcolle,caribpr,msin
+website = javdb,javdb,javdb
 
 [escape]
 literals = \()/
@@ -129,7 +129,7 @@ mode = 1
 vars = outline,series,studio,tag,title
 
 [javdb]
-sites = 38,39,40
+sites = 521
 
 ; 人脸识别 locations_model=hog:方向梯度直方图(不太准确,速度快) cnn:深度学习模型(准确,需要GPU/CUDA,速度慢)
 ; uncensored_only=0:对全部封面进行人脸识别 1:只识别无码封面,有码封面直接切右半部分

From c3e5fdb09fefa11c614e519786e942b08b2a8fb0 Mon Sep 17 00:00:00 2001
From: yoshiko2
Date: Sat, 8 Jul 2023 04:59:27 +0800
Subject: [PATCH 5/8] Update 6.6.6

---
 config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config.ini b/config.ini
index dfb0375..7aa6487 100755
--- a/config.ini
+++ b/config.ini
@@ -58,7 +58,7 @@ image_naming_with_number = 0
 update_check = 1
 
 [priority]
-website = javdb,javdb,javdb
+website = javbus,airav,jav321,fanza,xcity,mgstage,fc2,avsox,dlsite,carib,madou,getchu,javdb,gcolle,javday,javmenu,pcolle,caribpr,msin
 
 [escape]
 literals = \()/

From 3597a9590d94175dc61a64e60c88fe0025e24e99 Mon Sep 17 00:00:00 2001
From: yoshiko2
Date: Sat, 8 Jul 2023 16:53:01 +0800
Subject: [PATCH 6/8] Add site pcolle

---
 scrapinglib/pcolle.py | 58 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 scrapinglib/pcolle.py

diff --git a/scrapinglib/pcolle.py b/scrapinglib/pcolle.py
new file mode 100644
index 0000000..3cfc620
--- /dev/null
+++ b/scrapinglib/pcolle.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+import re
+from lxml import etree
+from .httprequest import request_session
+from .parser import Parser
+
+
+class Pcolle(Parser):
+    source = 'pcolle'
+
+    expr_number = '//th[contains(text(),"商品ID")]/../td/text()'
+    expr_title = '//div[@class="title-04"]/div/text()'
+    expr_studio = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_director = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_actor = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_label = '//th[contains(text(),"カテゴリー")]/../td/ul/li/a/text()'
+    expr_series = '//th[contains(text(),"カテゴリー")]/../td/ul/li/a/text()'
+    expr_release = '//th[contains(text(),"販売開始日")]/../td/text()'
+    expr_cover = '/html/body/div[1]/div/div[4]/div[2]/div/div[1]/div/article/a/img/@src'
+    expr_tags = '//p[contains(text(),"商品タグ")]/../ul/li/a/text()'
+    expr_outline = '//p[@class="fo-14"]/text()'
+    expr_extrafanart = '//*[@class="item-nav"]/ul/li/a/img/@src'
+
+    # expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'
+
+    def extraInit(self):
+        self.imagecut = 4
+
+    def search(self, number: str):
+        self.number = number.upper().replace('PCOLLE-', '')
+        self.detailurl = 'https://www.pcolle.com/product/detail/?product_id=' + self.number
+        session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
+        htmlcode = session.get(self.detailurl).text
+        htmltree = etree.HTML(htmlcode)
+        result = self.dictformat(htmltree)
+        return result
+
+    def getNum(self, htmltree):
+        num = super().getNum(htmltree).upper()
+        if self.number != num:
+            raise Exception(f'[!] {self.number}: find [{num}] in pcolle, not match')
+        return "PCOLLE-" + str(num)
+
+    def getOutline(self, htmltree):
+        result = self.getTreeAll(htmltree, self.expr_outline)
+        try:
+            return "\n".join(result)
+        except:
+            return ""
+
+    def getRelease(self, htmltree):
+        return super().getRelease(htmltree).replace('年', '-').replace('月', '-').replace('日', '')
+
+    def getCover(self, htmltree):
+        if ".gif" in super().getCover(htmltree) and len(super().getExtrafanart(htmltree)) != 0:
+            return super().getExtrafanart(htmltree)[0]
+        return super().getCover(htmltree)

From 0a263f665ceb1f20cb812438f6874635b5126b40 Mon Sep 17 00:00:00 2001
From: Hakusai Zhang
Date: Sat, 8 Jul 2023 18:20:30 +0800
Subject: [PATCH 7/8] only process file with suffix, not directory.

---
 Movie_Data_Capture.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Movie_Data_Capture.py b/Movie_Data_Capture.py
index a96657a..aece3c4 100644
--- a/Movie_Data_Capture.py
+++ b/Movie_Data_Capture.py
@@ -351,6 +351,8 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
     for full_name in source.glob(r'**/*'):
         if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set:
             continue
+        if not full_name.is_file():
+            continue
         if not full_name.suffix.lower() in file_type:
             continue
         absf = str(full_name)

From 26b82b17255b134e2cbfa296d342decd1ae2273a Mon Sep 17 00:00:00 2001
From: yoshiko2
Date: Sun, 9 Jul 2023 00:42:29 +0800
Subject: [PATCH 8/8] Update sites #2

---
 scrapinglib/api.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/scrapinglib/api.py b/scrapinglib/api.py
index 1889206..cf51ea4 100644
--- a/scrapinglib/api.py
+++ b/scrapinglib/api.py
@@ -220,10 +220,7 @@ class Scraping:
         elif "pcolle" in sources and "pcolle" in lo_file_number:
             sources = ["pcolle"]
         elif "fc2" in lo_file_number:
-            if "fc2" in sources:
-                sources = ["msin", "fc2"]
-                # sources = insert(sources, "msin")
-                # sources = insert(sources, "fc2")
+            sources = ["fc2", "msin"]
         elif "mgstage" in sources and \
             (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
             sources = insert(sources, "mgstage")