Add support for javlib
.github/workflows/main.yml  (28 lines changed, vendored)
@@ -25,11 +25,35 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
 
-      - name: Build with pyinstaller
+      # - name: Show cloudscraper package location
+      #   run: |
+      #     python -c 'import cloudscraper as _; print(_.__path__)'
+
+      - name: Build with pyinstaller (windows)
+        if: matrix.os == 'windows-latest'
         run: |
-          pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py
+          pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data='C:\\hostedtoolcache\\windows\\Python\\3.7.6\\x64\\lib\\site-packages\\cloudscraper\\;cloudscraper'
+
+      - name: Build with pyinstaller (mac)
+        if: matrix.os == 'macos-latest'
+        run: |
+          pyinstaller \
+            --onefile AV_Data_Capture.py \
+            --hidden-import ADC_function.py \
+            --hidden-import core.py \
+            --add-data='/Users/runner/hostedtoolcache/Python/3.7.6/x64/lib/python3.7/site-packages/cloudscraper/:cloudscraper'
+
+      - name: Build with pyinstaller (ubuntu)
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          pyinstaller \
+            --onefile AV_Data_Capture.py \
+            --hidden-import ADC_function.py \
+            --hidden-import core.py \
+            --add-data='/opt/hostedtoolcache/Python/3.7.6/x64/lib/python3.7/site-packages/cloudscraper/:cloudscraper'
 
       - name: Copy config.ini
         run: |
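The hard-coded hostedtoolcache paths above are tied to the Python 3.7.6 layout on GitHub-hosted runners, which is what the commented-out "Show cloudscraper package location" step was probing. A portable alternative, sketched below and not part of this commit, is to ask the interpreter where cloudscraper is installed and build the --add-data argument from that:

# Hypothetical helper, not part of this commit: print a ready-made
# pyinstaller --add-data argument for whichever interpreter runs the build.
import os
import cloudscraper

pkg_dir = os.path.dirname(cloudscraper.__file__)
# PyInstaller separates source and destination with ';' on Windows and ':'
# elsewhere, which is exactly what os.pathsep provides.
print("--add-data={}{}cloudscraper".format(pkg_dir, os.pathsep))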
ADC_function.py
@@ -1,5 +1,6 @@
 import requests
 from lxml import etree
+import cloudscraper
 
 import config
@@ -23,23 +24,39 @@ def getXpathSingle(htmlcode,xpath):
     return result1
 
 
+def get_proxy(proxy: str) -> dict:
+    if proxy:
+        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
+    else:
+        proxies = {}
+
+    return proxies
+
+
 # Web request core
-def get_html(url, cookies=None):
+def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
     proxy, timeout, retry_count = config.Config().proxy()
+    proxies = get_proxy(proxy)
+
+    if ua is None:
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}  # noqa
+    else:
+        headers = {"User-Agent": ua}
 
     for i in range(retry_count):
         try:
             if not proxy == '':
-                proxies = {"http": "http://" + proxy,"https": "https://" + proxy}
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout,proxies=proxies, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
+                result = requests.get(str(url), headers=headers, timeout=timeout, proxies=proxies, cookies=cookies)
             else:
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
+                result = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
+
+            result.encoding = "utf-8"
+
+            if return_type == "object":
+                return result
+            else:
+                return result.text
+
         except requests.exceptions.ProxyError:
             print("[-]Connect retry {}/{}".format(i + 1, retry_count))
     print('[-]Connect Failed! Please check your Proxy or Network!')
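A quick illustration of how callers use the widened signature (a sketch only; the cookie and User-Agent values are placeholders): the default call still returns decoded HTML text, while return_type="object" hands back the full requests.Response so a scraper can inspect result.url, and ua lets Cloudflare cookies be sent together with the User-Agent that earned them.

# Sketch; assumes config.ini supplies the proxy/timeout/retry settings.
html_text = get_html("https://www.javbus.com/")           # plain page text, as before

search = get_html(
    "http://www.m45e.com/cn/vl_searchbyid.php?keyword=DVMC-003",
    cookies={"cf_clearance": "placeholder"},               # placeholder value
    ua="Mozilla/5.0 (placeholder)",                        # must match the cookie's UA
    return_type="object",                                  # full Response, not just .text
)
print(search.url)   # javlib appears to redirect to "/?v=jav..." on an exact match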
@@ -49,11 +66,7 @@ def get_html(url, cookies=None):
 
 def post_html(url: str, query: dict) -> requests.Response:
     proxy, timeout, retry_count = config.Config().proxy()
-
-    if proxy:
-        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
-    else:
-        proxies = {}
+    proxies = get_proxy(proxy)
 
     for i in range(retry_count):
         try:
@@ -64,3 +77,25 @@ def post_html(url: str, query: dict) -> requests.Response:
     print("[-]Connect Failed! Please check your Proxy or Network!")
     input("Press ENTER to exit!")
     exit()
+
+
+def get_javlib_cookie() -> [dict, str]:
+    proxy, timeout, retry_count = config.Config().proxy()
+    proxies = get_proxy(proxy)
+
+    raw_cookie = {}
+    user_agent = ""
+
+    # Get __cfduid/cf_clearance and user-agent
+    for i in range(retry_count):
+        try:
+            raw_cookie, user_agent = cloudscraper.get_cookie_string(
+                "http://www.m45e.com/",
+                proxies=proxies
+            )
+        except requests.exceptions.ProxyError:
+            print("[-] ProxyError, retry {}/{}".format(i+1, retry_count))
+        except cloudscraper.exceptions.CloudflareIUAMError:
+            print("[-] IUAMError, retry {}/{}".format(i+1, retry_count))
+
+    return raw_cookie, user_agent
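For context (an assumption about the cloudscraper API rather than something this diff spells out): cloudscraper.get_cookie_string() returns a two-tuple of a Cookie-header style string and the User-Agent that solved the Cloudflare IUAM challenge, which is why both values are captured and returned together here.

# Sketch of consuming that tuple; the printed values are illustrative only.
import cloudscraper

cookie_string, user_agent = cloudscraper.get_cookie_string("http://www.m45e.com/")
# cookie_string looks like "__cfduid=...; cf_clearance=..." and is only valid
# when replayed together with this exact user_agent.
print(cookie_string)
print(user_agent)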
config.ini

@@ -17,7 +17,7 @@ naming_rule=number+'-'+title
 update_check=1
 
 [priority]
-website=javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321
+website=javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321,javlib
 
 [escape]
 literals=\()/
core.py  (2 lines changed)
@@ -15,6 +15,7 @@ import javbus
 import javdb
 import mgstage
 import xcity
+import javlib
 
 
 def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -53,6 +54,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config): # 从JSON
         "mgstage": mgstage.main,
         "jav321": jav321.main,
         "xcity": xcity.main,
+        "javlib": javlib.main,
     }
 
     # default fetch order list, from the beginning to the end
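The new mapping entry works together with the [priority] website= list from config.ini: each name in that comma-separated string is looked up in this dictionary and its main() is called in order. A minimal, self-contained sketch of that flow (assumed; the real loop in core.py sits outside this hunk, and the lambdas stand in for javbus.main, javlib.main, etc.):

import json

# Placeholder scrapers; each returns a JSON string like the real main() functions.
func_mapping = {
    "javbus": lambda number: json.dumps({}),
    "javlib": lambda number: json.dumps({"title": "demo", "number": number}),
}

sources = "javbus,javlib".split(",")     # e.g. the config.ini [priority] website= value

for source in sources:
    data = json.loads(func_mapping[source]("DVMC-003"))
    if data:                             # first source that returns metadata wins
        print(source, data)
        break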
javlib.py  (new file, 109 lines)
@@ -0,0 +1,109 @@
import json
import bs4
from bs4 import BeautifulSoup
from lxml import html
from http.cookies import SimpleCookie

from ADC_function import get_javlib_cookie, get_html


def main(number: str):
    raw_cookies, user_agent = get_javlib_cookie()

    # Blank cookies mean the javlib site returned an error
    if not raw_cookies:
        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

    # Manually construct a cookie dictionary
    s_cookie = SimpleCookie()
    s_cookie.load(raw_cookies)
    cookies = {}
    for key, morsel in s_cookie.items():
        cookies[key] = morsel.value

    # Scraping
    result = get_html(
        "http://www.m45e.com/cn/vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies,
        ua=user_agent,
        return_type="object"
    )
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    if "/?v=jav" in result.url:
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": "",
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_td(soup, "video_label"),
            "tag": get_table_el_multi_anchor(soup, "video_genres"),
            "number": get_table_el_td(soup, "video_id"),
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
        }
    else:
        dic = {}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    return lx.xpath(xpath)[0].strip()


def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tag = soup.find(id=tag_id).find("a")

    if tag is not None:
        return tag.string.strip()
    else:
        return ""


def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("a")

    return process(tags)


def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("td", class_="text")

    return process(tags)


def process(tags: bs4.element.ResultSet) -> str:
    values = []
    for tag in tags:
        value = tag.string
        if value is not None and value != "----":
            values.append(value)

    return ",".join(x for x in values if x)


def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
    number = get_table_el_td(soup, "video_id")

    return title.replace(number, "").strip()


def get_cover(lx: html.HtmlElement) -> str:
    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))


if __name__ == "__main__":
    # lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
    lists = ["DVMC-003"]
    for num in lists:
        print(main(num))
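The cookie handling in main() above is worth isolating: get_javlib_cookie() hands back a raw Cookie-header string, and http.cookies.SimpleCookie turns it into the plain dict that requests (via get_html) expects. A standalone sketch with made-up token values:

from http.cookies import SimpleCookie

raw_cookies = "__cfduid=abc123; cf_clearance=def456"   # illustrative values only
s_cookie = SimpleCookie()
s_cookie.load(raw_cookies)
cookies = {key: morsel.value for key, morsel in s_cookie.items()}
print(cookies)   # {'__cfduid': 'abc123', 'cf_clearance': 'def456'}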
requirements.txt
@@ -4,3 +4,4 @@ lxml
 beautifulsoup4
 pillow
 pyinstaller
+cloudscraper