Add support for javlib
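This commit adds javlib as a new metadata source. javlib.py implements the scraper; ADC_function.py gains a get_proxy() helper, an extended get_html() (optional ua and return_type parameters), and a get_javlib_cookie() that uses cloudscraper to obtain the Cloudflare cookies and matching User-Agent; javlib is appended to the [priority] website list in config.ini and to the scraper mapping in core.py; cloudscraper is added to requirements.txt; and the CI workflow now builds per OS so pyinstaller can bundle cloudscraper's data files via --add-data.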
.github/workflows/main.yml (vendored, 28 lines changed)
@@ -25,11 +25,35 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install -r requirements.txt
 
-      - name: Build with pyinstaller
-        run: |
-          pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py
+      # - name: Show cloudscraper package location
+      #   run: |
+      #     python -c 'import cloudscraper as _; print(_.__path__)'
+
+      - name: Build with pyinstaller (windows)
+        if: matrix.os == 'windows-latest'
+        run: |
+          pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data='C:\\hostedtoolcache\\windows\\Python\\3.7.6\\x64\\lib\\site-packages\\cloudscraper\\;cloudscraper'
+
+      - name: Build with pyinstaller (mac)
+        if: matrix.os == 'macos-latest'
+        run: |
+          pyinstaller \
+            --onefile AV_Data_Capture.py \
+            --hidden-import ADC_function.py \
+            --hidden-import core.py \
+            --add-data='/Users/runner/hostedtoolcache/Python/3.7.6/x64/lib/python3.7/site-packages/cloudscraper/:cloudscraper'
+
+      - name: Build with pyinstaller (ubuntu)
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          pyinstaller \
+            --onefile AV_Data_Capture.py \
+            --hidden-import ADC_function.py \
+            --hidden-import core.py \
+            --add-data='/opt/hostedtoolcache/Python/3.7.6/x64/lib/python3.7/site-packages/cloudscraper/:cloudscraper'
 
       - name: Copy config.ini
         run: |
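Note on the build steps above: the --add-data arguments hardcode each runner's hosted-toolcache site-packages path. As a hedged aside (not part of this commit), the commented-out "Show cloudscraper package location" step suggests the same directory could be resolved programmatically inside the CI job, roughly:

    # Sketch only: resolve cloudscraper's install directory at build time,
    # then feed it to pyinstaller --add-data instead of a hardcoded path.
    import os
    import cloudscraper

    pkg_dir = os.path.dirname(cloudscraper.__file__)
    sep = ";" if os.name == "nt" else ":"   # pyinstaller uses ';' on Windows, ':' elsewhere
    print("--add-data='{}{}cloudscraper'".format(pkg_dir, sep))
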
ADC_function.py
@@ -1,5 +1,6 @@
 import requests
 from lxml import etree
+import cloudscraper
 
 import config
 
@@ -23,23 +24,39 @@ def getXpathSingle(htmlcode,xpath):
     return result1
 
 
+def get_proxy(proxy: str) -> dict:
+    if proxy:
+        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
+    else:
+        proxies = {}
+
+    return proxies
+
+
 # Core web request
-def get_html(url, cookies=None):
+def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
     proxy, timeout, retry_count = config.Config().proxy()
+    proxies = get_proxy(proxy)
+
+    if ua is None:
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}  # noqa
+    else:
+        headers = {"User-Agent": ua}
+
     for i in range(retry_count):
         try:
             if not proxy == '':
-                proxies = {"http": "http://" + proxy,"https": "https://" + proxy}
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout,proxies=proxies, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
+                result = requests.get(str(url), headers=headers, timeout=timeout, proxies=proxies, cookies=cookies)
             else:
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
+                result = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
+
+            result.encoding = "utf-8"
+
+            if return_type == "object":
+                return result
+            else:
+                return result.text
+
         except requests.exceptions.ProxyError:
             print("[-]Connect retry {}/{}".format(i + 1, retry_count))
     print('[-]Connect Failed! Please check your Proxy or Network!')
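A usage illustration of the extended get_html() (the URL and cookie value below are placeholders, not from the commit): the old call style still returns page text with the default User-Agent, while javlib.py passes its Cloudflare cookies and User-Agent and asks for the Response object back so it can inspect result.url:

    from ADC_function import get_html

    text = get_html("https://example.com")              # old behaviour: returns page text

    resp = get_html("https://example.com",
                    cookies={"cf_clearance": "..."},     # placeholder cookie
                    ua="Mozilla/5.0",                    # placeholder User-Agent
                    return_type="object")                # get the full Response back
    print(resp.url)
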
@@ -49,11 +66,7 @@ def get_html(url, cookies=None):
 
 def post_html(url: str, query: dict) -> requests.Response:
     proxy, timeout, retry_count = config.Config().proxy()
-    if proxy:
-        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
-    else:
-        proxies = {}
-
+    proxies = get_proxy(proxy)
 
     for i in range(retry_count):
         try:
@@ -64,3 +77,25 @@ def post_html(url: str, query: dict) -> requests.Response:
     print("[-]Connect Failed! Please check your Proxy or Network!")
     input("Press ENTER to exit!")
     exit()
+
+
+def get_javlib_cookie() -> [dict, str]:
+    proxy, timeout, retry_count = config.Config().proxy()
+    proxies = get_proxy(proxy)
+
+    raw_cookie = {}
+    user_agent = ""
+
+    # Get __cfduid/cf_clearance and user-agent
+    for i in range(retry_count):
+        try:
+            raw_cookie, user_agent = cloudscraper.get_cookie_string(
+                "http://www.m45e.com/",
+                proxies=proxies
+            )
+        except requests.exceptions.ProxyError:
+            print("[-] ProxyError, retry {}/{}".format(i+1, retry_count))
+        except cloudscraper.exceptions.CloudflareIUAMError:
+            print("[-] IUAMError, retry {}/{}".format(i+1, retry_count))
+
+    return raw_cookie, user_agent
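cloudscraper.get_cookie_string() hands back the raw Cookie header string together with the User-Agent that solved the challenge; javlib.py below turns that string into a requests-style dict. A condensed sketch of that conversion (assuming the call above succeeded):

    from http.cookies import SimpleCookie

    raw_cookie, user_agent = get_javlib_cookie()
    jar = SimpleCookie()
    jar.load(raw_cookie)                                  # parses "k1=v1; k2=v2"
    cookies = {key: morsel.value for key, morsel in jar.items()}
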
config.ini
@@ -17,7 +17,7 @@ naming_rule=number+'-'+title
 update_check=1
 
 [priority]
-website=javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321
+website=javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321,javlib
 
 [escape]
 literals=\()/
core.py (2 lines changed)
@@ -15,6 +15,7 @@ import javbus
 import javdb
 import mgstage
 import xcity
+import javlib
 
 
 def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -53,6 +54,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config):  # 从JSON
         "mgstage": mgstage.main,
         "jav321": jav321.main,
         "xcity": xcity.main,
+        "javlib": javlib.main,
     }
 
     # default fetch order list, from the beginning to the end
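For context, the mapping above is what lets the [priority] website list in config.ini decide which scraper runs; a simplified, hypothetical sketch of that dispatch (variable names illustrative, core.py's real loop differs in its details):

    import json

    file_number = "DVMC-003"                              # sample number from javlib.py's test list
    func_mapping = {"javlib": javlib.main}                # plus the other sources shown above

    for source in "javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321,javlib".split(","):
        scraper = func_mapping.get(source)
        if scraper is None:
            continue
        data = json.loads(scraper(file_number))           # each scraper returns a JSON string
        if data:                                          # stop at the first source with a hit
            break
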
javlib.py (new file, 109 lines)
@@ -0,0 +1,109 @@
+import json
+
+import bs4
+from bs4 import BeautifulSoup
+from lxml import html
+from http.cookies import SimpleCookie
+
+from ADC_function import get_javlib_cookie, get_html
+
+
+def main(number: str):
+    raw_cookies, user_agent = get_javlib_cookie()
+
+    # Blank cookies mean the javlib site returned an error
+    if not raw_cookies:
+        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+    # Manually construct a cookie dictionary
+    s_cookie = SimpleCookie()
+    s_cookie.load(raw_cookies)
+    cookies = {}
+    for key, morsel in s_cookie.items():
+        cookies[key] = morsel.value
+
+    # Scraping
+    result = get_html(
+        "http://www.m45e.com/cn/vl_searchbyid.php?keyword={}".format(number),
+        cookies=cookies,
+        ua=user_agent,
+        return_type="object"
+    )
+    soup = BeautifulSoup(result.text, "html.parser")
+    lx = html.fromstring(str(soup))
+
+    if "/?v=jav" in result.url:
+        dic = {
+            "title": get_title(lx, soup),
+            "studio": get_table_el_single_anchor(soup, "video_maker"),
+            "year": get_table_el_td(soup, "video_date")[:4],
+            "outline": "",
+            "director": get_table_el_single_anchor(soup, "video_director"),
+            "cover": get_cover(lx),
+            "imagecut": 1,
+            "actor_photo": "",
+            "website": result.url,
+            "source": "javlib.py",
+            "actor": get_table_el_multi_anchor(soup, "video_cast"),
+            "label": get_table_el_td(soup, "video_label"),
+            "tag": get_table_el_multi_anchor(soup, "video_genres"),
+            "number": get_table_el_td(soup, "video_id"),
+            "release": get_table_el_td(soup, "video_date"),
+            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
+        }
+    else:
+        dic = {}
+
+    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+
+def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
+    return lx.xpath(xpath)[0].strip()
+
+
+def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
+    tag = soup.find(id=tag_id).find("a")
+
+    if tag is not None:
+        return tag.string.strip()
+    else:
+        return ""
+
+
+def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
+    tags = soup.find(id=tag_id).find_all("a")
+
+    return process(tags)
+
+
+def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
+    tags = soup.find(id=tag_id).find_all("td", class_="text")
+
+    return process(tags)
+
+
+def process(tags: bs4.element.ResultSet) -> str:
+    values = []
+    for tag in tags:
+        value = tag.string
+        if value is not None and value != "----":
+            values.append(value)
+
+    return ",".join(x for x in values if x)
+
+
+def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
+    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
+    number = get_table_el_td(soup, "video_id")
+
+    return title.replace(number, "").strip()
+
+
+def get_cover(lx: html.HtmlElement) -> str:
+    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))
+
+
+if __name__ == "__main__":
+    # lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
+    lists = ["DVMC-003"]
+    for num in lists:
+        print(main(num))
requirements.txt
@@ -4,3 +4,4 @@ lxml
 beautifulsoup4
 pillow
 pyinstaller
+cloudscraper