@@ -351,6 +351,8 @@ def movie_lists(source_folder, regexstr: str) -> typing.List[str]:
     for full_name in source.glob(r'**/*'):
         if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set:
             continue
+        if not full_name.is_file():
+            continue
         if not full_name.suffix.lower() in file_type:
             continue
         absf = str(full_name)
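The two added lines guard against directories whose names end in a video extension (e.g. a `Sample.MP4/` folder), which `glob('**/*')` would otherwise pass straight through to the suffix check. A minimal sketch of the effect; the temporary tree is a hypothetical example, not part of the PR:

```python
# Minimal sketch of the added is_file() guard.
from pathlib import Path
import tempfile

root = Path(tempfile.mkdtemp())
(root / 'Sample.MP4').mkdir()        # directory named like a video
(root / 'movie.mp4').write_text('')  # real (empty) file

file_type = {'.mp4', '.mkv'}
found = []
for full_name in root.glob(r'**/*'):
    if not full_name.is_file():      # the new guard: skip directories
        continue
    if full_name.suffix.lower() not in file_type:
        continue
    found.append(str(full_name))

print(found)  # only .../movie.mp4; the Sample.MP4/ directory is filtered out
```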
@@ -681,7 +683,7 @@ def period(delta, pattern):


 if __name__ == '__main__':
-    version = '6.6.5'
+    version = '6.6.6'
     urllib3.disable_warnings()  # Ignore http proxy warning
     app_start = time.time()
@@ -129,7 +129,7 @@ mode = 1
 vars = outline,series,studio,tag,title

 [javdb]
-sites = 38,39,40
+sites = 521

 ; Face detection: locations_model=hog: Histogram of Oriented Gradients (less accurate, fast)  cnn: deep-learning model (accurate, needs GPU/CUDA, slow)
 ; uncensored_only=0: run face detection on every cover  1: only detect faces on uncensored covers; censored covers are simply cropped to the right half
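The hog/cnn options described in these config comments map onto the `model` parameter of the `face_recognition` library (already in requirements). A hedged sketch of the downstream call; the image path is a placeholder:

```python
# Sketch of the hog-vs-cnn trade-off named in the config comments;
# "cover.jpg" is a placeholder path.
import face_recognition

image = face_recognition.load_image_file("cover.jpg")

# hog: fast, CPU-only, less accurate
locations = face_recognition.face_locations(image, model="hog")

# cnn: more accurate, needs GPU/CUDA-enabled dlib, slower
# locations = face_recognition.face_locations(image, model="cnn")

print(locations)  # list of (top, right, bottom, left) face boxes
```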
@@ -5,7 +5,7 @@ numpy
 face-recognition-models
 lxml
 beautifulsoup4
-pillow
+pillow==9.5.0
 cloudscraper
 pysocks==1.7.1
 urllib3==1.25.11
@@ -220,9 +220,7 @@ class Scraping:
+        elif "pcolle" in sources and "pcolle" in lo_file_number:
+            sources = ["pcolle"]
         elif "fc2" in lo_file_number:
-            if "fc2" in sources:
-                sources = insert(sources, "msin")
-                sources = insert(sources, "fc2")
+            sources = ["fc2", "msin"]
         elif "mgstage" in sources and \
                 (re.search(r"\d+\D+", file_number) or "siro" in lo_file_number):
             sources = insert(sources, "mgstage")
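Two behavior changes here: a pcolle number now pins the source list to just `["pcolle"]`, and an fc2 number now hard-resets the list to `["fc2", "msin"]` instead of promoting those two with `insert()` while keeping the rest as fallbacks. A sketch of the difference, assuming `insert()` moves a known source to the front of the list (which is what the removed code implies):

```python
# Assumed insert() semantics: move `name` to the front if present.
def insert(sources, name):
    if name in sources:
        sources.remove(name)
        return [name] + sources
    return sources

sources = ['javbus', 'msin', 'fc2', 'javdb']

old = insert(insert(list(sources), 'msin'), 'fc2')
new = ['fc2', 'msin']

print(old)  # ['fc2', 'msin', 'javbus', 'javdb'] - other scrapers kept as fallbacks
print(new)  # ['fc2', 'msin']                    - fallbacks dropped
```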
@@ -245,7 +243,7 @@ class Scraping:
             todel.append(s)
         for d in todel:
             if config.getInstance().debug():
-                print('[!] Remove Source : ' + s)
+                print('[!] Remove Source : ' + d)
             sources.remove(d)
         return sources
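The one-character change is a genuine bug fix: `s` is the loop variable of the earlier collection pass, so with debug enabled the old code logged whatever source `s` last pointed at, not the one actually being removed. A contrived illustration with placeholder names:

```python
# Contrived illustration of the old bug.
sources = ['javdb', 'avsox', 'xcity']
todel = ['javdb', 'avsox']

for s in sources:   # collection pass leaves s == 'xcity'
    pass

for d in todel:
    print('[!] Remove Source : ' + s)   # old: prints 'xcity' both times
    print('[!] Remove Source : ' + d)   # fixed: prints the source being removed
```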
@@ -97,3 +97,8 @@ class Dlsite(Parser):
         except:
             result = ''
         return result
+
+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("DLsite")
+        return tags
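The `getTags` overrides added to Dlsite and to both Getchu parsers below all follow the same pattern: defer to the base class, then append the site name so scraped entries carry their origin as a tag. A stripped-down sketch, with the base XPath lookup stubbed out:

```python
# Stand-in base class; the real Parser.getTags runs an XPath lookup.
class Parser:
    def getTags(self, htmltree):
        return ['tagA', 'tagB']

class Dlsite(Parser):
    def getTags(self, htmltree):
        tags = super().getTags(htmltree)
        tags.append("DLsite")
        return tags

print(Dlsite().getTags(None))  # ['tagA', 'tagB', 'DLsite']
```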
@@ -109,9 +109,15 @@ class wwwGetchu(Parser):
     def extradict(self, dic: dict):
         """ Extra headers added to the request
         """
         dic['headers'] = {'referer': self.detailurl}
         return dic

+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("Getchu")
+        return tags
+

 class dlGetchu(wwwGetchu):
     """ The two classes are essentially the same;
     headers and extrafanart differ slightly
@@ -154,7 +160,7 @@ class dlGetchu(wwwGetchu):

     def extradict(self, dic: dict):
         return dic

     def getExtrafanart(self, htmltree):
         arts = self.getTreeAll(htmltree, self.expr_extrafanart)
         extrafanart = []
@@ -162,3 +168,8 @@ class dlGetchu(wwwGetchu):
             i = "https://dl.getchu.com" + i
             extrafanart.append(i)
         return extrafanart
+
+    def getTags(self, htmltree):
+        tags = super().getTags(htmltree)
+        tags.append("Getchu")
+        return tags
@@ -219,7 +219,6 @@ class Parser:
-            if tag:
-                tags.append(tag)
-        return tags
+        return [ x.strip() for x in alls if x.strip()]

     def getStudio(self, htmltree):
         return self.getTreeElementbyExprs(htmltree, self.expr_studio, self.expr_studio2)
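The rewritten base `getTags` flattens the old accumulate-and-filter loop into a single comprehension: strip each matched string, drop the empties. A behavior sketch with illustrative input:

```python
# alls stands in for the raw strings matched by expr_tags.
alls = [' drama ', '', '\n', 'comedy']
tags = [x.strip() for x in alls if x.strip()]
print(tags)  # ['drama', 'comedy']
```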
scrapinglib/pcolle.py (new file, 58 lines)
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+import re
+from lxml import etree
+from .httprequest import request_session
+from .parser import Parser
+
+
+class Pcolle(Parser):
+    source = 'pcolle'
+
+    expr_number = '//th[contains(text(),"商品ID")]/../td/text()'
+    expr_title = '//div[@class="title-04"]/div/text()'
+    expr_studio = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_director = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_actor = '//th[contains(text(),"販売会員")]/../td/a/text()'
+    expr_label = '//th[contains(text(),"カテゴリー")]/../td/ul/li/a/text()'
+    expr_series = '//th[contains(text(),"カテゴリー")]/../td/ul/li/a/text()'
+    expr_release = '//th[contains(text(),"販売開始日")]/../td/text()'
+    expr_cover = '/html/body/div[1]/div/div[4]/div[2]/div/div[1]/div/article/a/img/@src'
+    expr_tags = '//p[contains(text(),"商品タグ")]/../ul/li/a/text()'
+    expr_outline = '//p[@class="fo-14"]/text()'
+    expr_extrafanart = '//*[@class="item-nav"]/ul/li/a/img/@src'
+
+    # expr_extrafanart2 = '//*[@id="cart_quantity"]/table/tr[3]/td/div/a/img/@src'
+
+    def extraInit(self):
+        self.imagecut = 4
+
+    def search(self, number: str):
+        self.number = number.upper().replace('PCOLLE-', '')
+        self.detailurl = 'https://www.pcolle.com/product/detail/?product_id=' + self.number
+        session = request_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
+        htmlcode = session.get(self.detailurl).text
+        htmltree = etree.HTML(htmlcode)
+        result = self.dictformat(htmltree)
+        return result
+
+    def getNum(self, htmltree):
+        num = super().getNum(htmltree).upper()
+        if self.number != num:
+            raise Exception(f'[!] {self.number}: find [{num}] in pcolle, not match')
+        return "PCOLLE-" + str(num)
+
+    def getOutline(self, htmltree):
+        result = self.getTreeAll(htmltree, self.expr_outline)
+        try:
+            return "\n".join(result)
+        except:
+            return ""
+
+    def getRelease(self, htmltree):
+        return super().getRelease(htmltree).replace('年', '-').replace('月', '-').replace('日', '')
+
+    def getCover(self, htmltree):
+        if ".gif" in super().getCover(htmltree) and len(super().getExtrafanart(htmltree)) != 0:
+            return super().getExtrafanart(htmltree)[0]
+        return super().getCover(htmltree)
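A hedged usage sketch for the new parser: `search()` is the entry point shown above, and `cookies`/`proxies`/`verify` come from the base `Parser`, so treat the exact call shape as an assumption rather than the project's documented API; the product id is hypothetical:

```python
# Hedged sketch; assumes Pcolle can be instantiated bare and that
# search() returns the metadata produced by dictformat(), as the
# code above suggests.
from scrapinglib.pcolle import Pcolle

parser = Pcolle()
data = parser.search('PCOLLE-123456')  # hypothetical product id
print(data)
```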