From f11378186de02420163151e07d0c38d10e7d20f1 Mon Sep 17 00:00:00 2001
From: Mathhew
Date: Mon, 13 Jun 2022 09:02:05 +0800
Subject: [PATCH] update lib

---
 scrapinglib/api.py         | 74 ++++++++++++++++++++++++++++++--------
 scrapinglib/fanza.py       | 11 ++++--
 scrapinglib/httprequest.py | 12 +++----
 scrapinglib/javdb.py       | 40 ++++++++++++++++-----
 scrapinglib/parser.py      | 20 +++-------
 scrapinglib/utils.py       | 31 ++++++++++++++++
 6 files changed, 140 insertions(+), 48 deletions(-)
 create mode 100644 scrapinglib/utils.py

diff --git a/scrapinglib/api.py b/scrapinglib/api.py
index cbe9138..ba85acf 100644
--- a/scrapinglib/api.py
+++ b/scrapinglib/api.py
@@ -57,10 +57,10 @@ class Scraping():
     """
-    full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
+    adult_full_sources = ['avsox', 'javbus', 'xcity', 'mgstage', 'madou', 'fc2',
                     'dlsite', 'jav321', 'fanza', 'airav', 'carib', 'mv91',
                     'gcolle', 'javdb', 'getchu']
-    func_mapping = {
+    adult_func_mapping = {
         'avsox': Avsox().scrape,
         'javbus': Javbus().scrape,
         'xcity': Xcity().scrape,
@@ -78,6 +78,11 @@ class Scraping():
         'getchu': Getchu().scrape,
     }
 
+    general_full_sources = ['tmdb']
+    general_func_mapping = {
+        'tmdb': Tmdb().scrape,
+    }
+
     proxies = None
     verify = None
 
@@ -98,22 +103,45 @@ class Scraping():
         else:
             return self.searchGeneral(number, sources)
 
-    def searchGeneral(self, number, sources):
+    def searchGeneral(self, name, sources):
         """ search movie and TV series metadata
             imdb, tmdb
         """
-        data = Tmdb().scrape(number, self)
-        json_data = json.loads(data)
-        return json_data
-
-    def searchAdult(self, number, sources):
-        sources = self.checkSources(sources, number)
+        sources = self.checkGeneralSources(sources, name)
         json_data = {}
         for source in sources:
             try:
                 print('[+]select', source)
                 try:
-                    data = self.func_mapping[source](number, self)
+                    data = self.general_func_mapping[source](name, self)
+                    if data == 404:
+                        continue
+                    json_data = json.loads(data)
+                except Exception as e:
+                    print('[!] 出错啦')
+                    print(e)
+                # if any service returns valid data, break
+                if self.get_data_state(json_data):
+                    print(f"[+]Find movie [{name}] metadata on website '{source}'")
+                    break
+            except:
+                continue
+
+        # return None if no source produced data
+        if not json_data:
+            print(f'[-]Movie Number [{name}] not found!')
+            return None
+
+        return json_data
+
+    def searchAdult(self, number, sources):
+        sources = self.checkAdultSources(sources, number)
+        json_data = {}
+        for source in sources:
+            try:
+                print('[+]select', source)
+                try:
+                    data = self.adult_func_mapping[source](number, self)
                     if data == 404:
                         continue
                     json_data = json.loads(data)
@@ -135,10 +163,26 @@ class Scraping():
 
         return json_data
 
-
-    def checkSources(self, c_sources, file_number):
+    def checkGeneralSources(self, c_sources, name):
         if not c_sources:
-            sources = self.full_sources
+            sources = self.general_full_sources
+        else:
+            sources = c_sources.split(',')
+
+        # drop any requested source that is not in general_func_mapping
+        todel = []
+        for s in sources:
+            if not s in self.general_func_mapping:
+                print('[!] Source Not Exist : ' + s)
+                todel.append(s)
+        for d in todel:
+            print('[!] Remove Source : ' + d)
+            sources.remove(d)
+        return sources
+
+    def checkAdultSources(self, c_sources, file_number):
+        if not c_sources:
+            sources = self.adult_full_sources
         else:
             sources = c_sources.split(',')
             def insert(sources, source):
                 if source in sources:
                     sources.insert(0, sources.pop(sources.index(source)))
                 return sources
 
-            if len(sources) <= len(self.func_mapping):
+            if len(sources) <= len(self.adult_func_mapping):
                 # if the input file name matches certain rules,
                 # move some web services to the beginning of the list
                 lo_file_number = file_number.lower()
@@ -182,7 +226,7 @@ class Scraping():
         # check sources in func_mapping
         todel = []
         for s in sources:
-            if not s in self.func_mapping:
+            if not s in self.adult_func_mapping:
                 print('[!] Source Not Exist : ' + s)
                 todel.append(s)
         for d in todel:
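With the split above, Scraping.search() dispatches to either searchGeneral() or searchAdult(), and each path filters its own source list before scraping. A minimal usage sketch, not part of the patch (the proxy/cookie setup and the exact search() dispatch arguments are assumptions; the 'title' key is assumed from the scrapers' JSON layout):

    from scrapinglib.api import Scraping

    scraper = Scraping()

    # Adult path: the comma-separated string is filtered by checkAdultSources(),
    # so an unknown entry is dropped with a '[!] Remove Source' warning.
    adult_meta = scraper.searchAdult('ABC-123', 'javdb,javbus')   # made-up number

    # General path: passing None falls back to general_full_sources
    # (currently only 'tmdb').
    movie_meta = scraper.searchGeneral('Inception', None)
    if movie_meta:
        print(movie_meta.get('title'))

Both methods return the parsed metadata dict, or None when every source misses.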
diff --git a/scrapinglib/fanza.py b/scrapinglib/fanza.py
index c06d26c..b3e5824 100644
--- a/scrapinglib/fanza.py
+++ b/scrapinglib/fanza.py
@@ -10,6 +10,7 @@ class Fanza(Parser):
     source = 'fanza'
 
     expr_title = '//*[starts-with(@id, "title")]/text()'
+    expr_actor = "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
     expr_outline = "//div[@class='mg-b20 lh4']/text()"
     expr_outline2 = "//div[@class='mg-b20 lh4']//p/text()"
     expr_runtime = "//td[contains(text(),'収録時間')]/following-sibling::td/text()"
@@ -132,10 +133,16 @@ class Fanza(Parser):
             return ''
 
     def getLabel(self, htmltree):
-        return self.getFanzaStrings('レーベル')
+        ret = self.getFanzaStrings('レーベル')
+        if ret == "----":
+            return ''
+        return ret
 
     def getSeries(self, htmltree):
-        return self.getFanzaStrings('シリーズ:')
+        ret = self.getFanzaStrings('シリーズ:')
+        if ret == "----":
+            return ''
+        return ret
 
     def getFanzaString(self, expr):
         result1 = str(self.htmltree.xpath("//td[contains(text(),'"+expr+"')]/following-sibling::td/a/text()")).strip(" ['']")

diff --git a/scrapinglib/httprequest.py b/scrapinglib/httprequest.py
index 0c677c0..997ff39 100644
--- a/scrapinglib/httprequest.py
+++ b/scrapinglib/httprequest.py
@@ -23,8 +23,7 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
     for i in range(retry):
         try:
             result = requests.get(url, headers=headers, timeout=timeout, proxies=proxies,
-                                  verify=verify,
-                                  cookies=cookies)
+                                  verify=verify, cookies=cookies)
             if return_type == "object":
                 return result
             elif return_type == "content":
@@ -44,8 +43,8 @@ def get(url: str, cookies=None, ua: str = None, extra_headers=None, return_type:
     raise Exception('Connect Failed')
 
 
-def post(url: str, data: dict, cookies = None, ua: str = None, return_type: str = None, encoding: str = None,
-         retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
+def post(url: str, data: dict, files=None, cookies=None, ua: str = None, return_type: str = None, encoding: str = None,
+         retry: int = 3, timeout: int = G_DEFAULT_TIMEOUT, proxies=None, verify=None):
     """ whether to use a proxy should be decided by the caller
     """
@@ -54,9 +53,8 @@ def post(url: str, data: dict, cookies = None, ua: str = None, return_type: str
 
     for i in range(retry):
         try:
-            result = requests.post(url, data=data, headers=headers, timeout=timeout, proxies=proxies,
-                                   verify=verify,
-                                   cookies=cookies)
+            result = requests.post(url, data=data, files=files, headers=headers, timeout=timeout, proxies=proxies,
+                                   verify=verify, cookies=cookies)
             if return_type == "object":
                 return result
             elif return_type == "content":
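The only functional change to httprequest is the new files parameter, which is handed straight to requests.post(); the rest is line reflow. Callers can now send multipart/form-data through the same retry/proxy plumbing. A short sketch under stated assumptions (the upload URL and field names are placeholders, not part of this library):

    from scrapinglib import httprequest

    with open('cover.jpg', 'rb') as fp:
        resp = httprequest.post(
            'https://example.com/upload',                     # placeholder URL
            data={'title': 'test'},
            files={'file': ('cover.jpg', fp, 'image/jpeg')},  # multipart tuple, as requests expects
            return_type='object',
        )
    if resp is not None:
        print(resp.status_code)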
diff --git a/scrapinglib/javdb.py b/scrapinglib/javdb.py
index a5ad159..839c166 100644
--- a/scrapinglib/javdb.py
+++ b/scrapinglib/javdb.py
@@ -4,7 +4,6 @@
 import re
 from urllib.parse import urljoin
 from lxml import etree
-from requests import session
 from .httprequest import get_html_session
 from .parser import Parser
 
@@ -13,22 +12,26 @@ class Javdb(Parser):
     source = 'javdb'
 
     fixstudio = False
+    noauth = False
 
     expr_number = '//strong[contains(text(),"番號")]/../span/text()'
     expr_number2 = '//strong[contains(text(),"番號")]/../span/a/text()'
     expr_title = "/html/head/title/text()"
+    expr_title_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/text()'
     expr_runtime = '//strong[contains(text(),"時長")]/../span/text()'
     expr_runtime2 = '//strong[contains(text(),"時長")]/../span/a/text()'
     expr_uncensored = '//strong[contains(text(),"類別")]/../span/a[contains(@href,"/tags/uncensored?") or contains(@href,"/tags/western?")]'
     expr_actor = '//span[@class="value"]/a[contains(@href,"/actors/")]/text()'
     expr_actor2 = '//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class'
     expr_release = '//strong[contains(text(),"日期")]/../span/text()'
+    expr_release_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "meta")]/text()'
     expr_studio = '//strong[contains(text(),"片商")]/../span/a/text()'
     expr_studio2 = '//strong[contains(text(),"賣家:")]/../span/a/text()'
     expr_director = '//strong[contains(text(),"導演")]/../span/text()'
     expr_director2 = '//strong[contains(text(),"導演")]/../span/a/text()'
     expr_cover = "//div[contains(@class, 'column-video-cover')]/a/img/@src"
     expr_cover2 = "//div[contains(@class, 'column-video-cover')]/img/@src"
+    expr_cover_no = '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "cover")]/img/@src'
     expr_extrafanart = "//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href"
     expr_tags = '//strong[contains(text(),"類別")]/../span/a/text()'
     expr_tags2 = '//strong[contains(text(),"類別")]/../span/text()'
@@ -57,14 +60,18 @@ class Javdb(Parser):
         else:
             self.dbsite = 'javdb'
 
-    def search(self, number):
+    def search(self, number: str):
         self.number = number
         self.session = get_html_session(cookies=self.cookies, proxies=self.proxies, verify=self.verify)
         self.detailurl = self.queryNumberUrl(number)
-        self.deatilpage = self.session.get(self.detailurl).text
-        htmltree = etree.fromstring(self.deatilpage, etree.HTMLParser())
-        result = self.dictformat(htmltree)
+        self.deatilpage = self.session.get(self.detailurl).text
+        if '此內容需要登入才能查看或操作' in self.deatilpage or '需要VIP權限才能訪問此內容' in self.deatilpage:
+            self.noauth = True
+            self.imagecut = 0
+            result = self.dictformat(self.querytree)
+        else:
+            htmltree = etree.fromstring(self.deatilpage, etree.HTMLParser())
+            result = self.dictformat(htmltree)
         return result
 
     def queryNumberUrl(self, number):
@@ -75,18 +82,19 @@ class Javdb(Parser):
             print(e)
             raise Exception(f'[!] {self.number}: page not found in javdb')
 
-        htmltree = etree.fromstring(resp.text, etree.HTMLParser())
+        self.querytree = etree.fromstring(resp.text, etree.HTMLParser())
         # javdb sometimes returns multiple results,
        # and the first element may not be the one we are looking for;
         # iterate all candidates and find the matching one
-        urls = self.getAll(htmltree, '//*[contains(@class,"movie-list")]/div/a/@href')
+        urls = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/@href')
         # western IDs look like ['Blacked','Blacked']
         if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number):
             correct_url = urls[0]
         else:
-            ids = self.getAll(htmltree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
+            ids = self.getAll(self.querytree, '//*[contains(@class,"movie-list")]/div/a/div[contains(@class, "video-title")]/strong/text()')
             try:
-                correct_url = urls[ids.index(number)]
+                self.queryid = ids.index(number)
+                correct_url = urls[self.queryid]
             except:
                 # to avoid picking up a wrong number, accept only an exact match
                 if ids[0].upper() != number:
@@ -95,6 +103,8 @@ class Javdb(Parser):
         return urljoin(resp.url, correct_url)
 
     def getNum(self, htmltree):
+        if self.noauth:
+            return self.number
         result1 = str(self.getAll(htmltree, self.expr_number)).strip(" ['']")
         result2 = str(self.getAll(htmltree, self.expr_number2)).strip(" ['']")
         dp_number = str(result2 + result1).strip('+')
@@ -105,10 +115,22 @@ class Javdb(Parser):
         return self.number
 
     def getTitle(self, htmltree):
+        if self.noauth:
+            return self.getTreeIndex(htmltree, self.expr_title_no, self.queryid)
         browser_title = super().getTitle(htmltree)
         title = browser_title[:browser_title.find(' | JavDB')].strip()
         return title.replace(self.number, '').strip()
 
+    def getCover(self, htmltree):
+        if self.noauth:
+            return self.getTreeIndex(htmltree, self.expr_cover_no, self.queryid)
+        return super().getCover(htmltree)
+
+    def getRelease(self, htmltree):
+        if self.noauth:
+            return self.getTreeIndex(htmltree, self.expr_release_no, self.queryid).strip()
+        return super().getRelease(htmltree)
+
     def getRuntime(self, htmltree):
         result1 = str(self.getAll(htmltree, self.expr_runtime)).strip(" ['']")
         result2 = str(self.getAll(htmltree, self.expr_runtime2)).strip(" ['']")
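The noauth fallback works because queryNumberUrl() now caches two things from the search-results page: self.querytree (the parsed list page) and self.queryid (the row whose number matched exactly). The three *_no expressions select parallel node lists from the same movie-list markup, so one index lines up title, cover and release date. A reduced, self-contained illustration of that indexing idea (the markup below is a stand-in, not real JavDB output):

    from lxml import etree

    page = etree.fromstring('''
    <div class="movie-list">
      <div><a href="/v/a1">
        <div class="video-title"><strong>ABC-122</strong> title one</div>
        <div class="meta">2022-06-01</div></a></div>
      <div><a href="/v/a2">
        <div class="video-title"><strong>ABC-123</strong> title two</div>
        <div class="meta">2022-06-02</div></a></div>
    </div>''', etree.HTMLParser())

    base = '//*[contains(@class,"movie-list")]/div/a'
    ids = page.xpath(base + '/div[contains(@class,"video-title")]/strong/text()')
    urls = page.xpath(base + '/@href')
    dates = page.xpath(base + '/div[contains(@class,"meta")]/text()')

    queryid = ids.index('ABC-123')                  # the exact-match row
    print(urls[queryid], dates[queryid].strip())    # /v/a2 2022-06-02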
diff --git a/scrapinglib/parser.py b/scrapinglib/parser.py
index c27665c..14493d8 100644
--- a/scrapinglib/parser.py
+++ b/scrapinglib/parser.py
@@ -3,12 +3,14 @@
 import json
 import re
 from lxml import etree, html
 from . import httprequest
+from .utils import getTreeElement, getTreeAll
 
 
 class Parser:
 
     source = 'base'
 
+    # poster image: `0` copy the cover, `1` crop it from the cover
     imagecut = 1
     uncensored = False
     allow_number_change = False
@@ -249,21 +251,9 @@ class Parser:
 
     def getTreeIndex(self, tree: html.HtmlElement, expr, index=0):
         """ get the value matched by the expression from the `xmltree`, index defaults to 0 """
-        if expr == '':
-            return ''
-        result = tree.xpath(expr)
-        try:
-            return result[index]
-        except:
-            return ''
+        return getTreeElement(tree, expr, index)
 
     def getAll(self, tree: html.HtmlElement, expr):
         """ get all values matched by the expression from the `xmltree` """
-        if expr == '':
-            return ''
-        result = tree.xpath(expr)
-        try:
-            return result
-        except:
-            return ''
+        return getTreeAll(tree, expr)
 
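Parser.getTreeIndex() and Parser.getAll() keep their signatures, so every scraper built on the expr_* attributes is unaffected; only the implementation moves to utils, where code outside the Parser hierarchy can share it. A quick sketch of the preserved safe-miss semantics (assuming the package is importable):

    from lxml import html
    from scrapinglib.utils import getTreeElement, getTreeAll

    tree = html.fromstring('<ul><li>a</li><li>b</li></ul>')

    assert getTreeElement(tree, '//li/text()') == 'a'     # default index=0
    assert getTreeElement(tree, '//li/text()', 5) == ''   # out of range -> ''
    assert getTreeAll(tree, '//li/text()') == ['a', 'b']
    assert getTreeAll(tree, '') == ''                     # empty expression -> ''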
diff --git a/scrapinglib/utils.py b/scrapinglib/utils.py
new file mode 100644
index 0000000..490d34f
--- /dev/null
+++ b/scrapinglib/utils.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+from lxml.html import HtmlElement
+
+def getTreeElement(tree: HtmlElement, expr, index=0):
+    """ get the value matched by the expression from the `xmltree`, index defaults to 0
+    :param tree (html.HtmlElement)
+    :param expr
+    :param index
+    """
+    if expr == '':
+        return ''
+    result = tree.xpath(expr)
+    try:
+        return result[index]
+    except:
+        return ''
+
+
+def getTreeAll(tree: HtmlElement, expr):
+    """ get all values matched by the expression from the `xmltree`
+    :param tree (html.HtmlElement)
+    :param expr
+    """
+    if expr == '':
+        return ''
+    result = tree.xpath(expr)
+    try:
+        return result
+    except:
+        return ''
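Since the helpers accept any XPath, they cover attribute lookups (such as the @src and @href expressions used by the scrapers) as well as text() lookups. One more usage sketch on toy markup:

    from lxml import html
    from scrapinglib.utils import getTreeElement, getTreeAll

    tree = html.fromstring(
        '<div class="cover"><img src="/img/1.jpg"/><img src="/img/2.jpg"/></div>')

    print(getTreeElement(tree, '//div[@class="cover"]/img/@src'))  # /img/1.jpg
    print(getTreeAll(tree, '//div[@class="cover"]/img/@src'))      # ['/img/1.jpg', '/img/2.jpg']

Note that tree.xpath() runs outside the try/except, so a syntactically invalid expression still raises; only an empty expression or a missing match is silenced.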