From b65581de471a75223b465ea293a5e3601d3f7950 Mon Sep 17 00:00:00 2001
From: 68cdrBxM8YdoJ <68cdrBxM8YdoJ@gmail.com>
Date: Thu, 23 Apr 2020 10:05:53 +0900
Subject: [PATCH] Add support for javlib

---
 .github/workflows/main.yml |  28 +++++++++-
 ADC_function.py            |  66 ++++++++++++++++++-----
 config.ini                 |   2 +-
 core.py                    |   2 +
 javlib.py                  | 109 +++++++++++++++++++++++++++++++++++++
 requirements.txt           |   3 +-
 6 files changed, 191 insertions(+), 19 deletions(-)
 create mode 100644 javlib.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index fee47d2..575d3bd 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -25,11 +25,35 @@ jobs:

     - name: Install dependencies
       run: |
+        python -m pip install --upgrade pip
         pip install -r requirements.txt

-    - name: Build with pyinstaller
+#    - name: Show cloudscraper package location
+#      run: |
+#        python -c 'import cloudscraper as _; print(_.__path__)'
+
+    - name: Build with pyinstaller (windows)
+      if: matrix.os == 'windows-latest'
       run: |
-        pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py
+        pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data='C:\\hostedtoolcache\\windows\\Python\\3.7.6\\x64\\lib\\site-packages\\cloudscraper\\;cloudscraper'
+
+    - name: Build with pyinstaller (mac)
+      if: matrix.os == 'macos-latest'
+      run: |
+        pyinstaller \
+          --onefile AV_Data_Capture.py \
+          --hidden-import ADC_function.py \
+          --hidden-import core.py \
+          --add-data='/Users/runner/hostedtoolcache/Python/3.7.6/x64/lib/python3.7/site-packages/cloudscraper/:cloudscraper'
+
+    - name: Build with pyinstaller (ubuntu)
+      if: matrix.os == 'ubuntu-latest'
+      run: |
+        pyinstaller \
+          --onefile AV_Data_Capture.py \
+          --hidden-import ADC_function.py \
+          --hidden-import core.py \
+          --add-data='/opt/hostedtoolcache/Python/3.7.6/x64/lib/python3.7/site-packages/cloudscraper/:cloudscraper'

     - name: Copy config.ini
       run: |
diff --git a/ADC_function.py b/ADC_function.py
index 8cf3156..481cb47 100755
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -1,5 +1,6 @@
 import requests
 from lxml import etree
+import cloudscraper

 import config

@@ -23,23 +24,39 @@ def getXpathSingle(htmlcode,xpath):
     return result1


+def get_proxy(proxy: str) -> dict:
+    if proxy:
+        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
+    else:
+        proxies = {}
+
+    return proxies
+
+
 # Core helper for web requests
-def get_html(url, cookies=None):
+def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
     proxy, timeout, retry_count = config.Config().proxy()
+    proxies = get_proxy(proxy)
+
+    if ua is None:
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}  # noqa
+    else:
+        headers = {"User-Agent": ua}

     for i in range(retry_count):
         try:
             if not proxy == '':
-                proxies = {"http": "http://" + proxy,"https": "https://" + proxy}
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout,proxies=proxies, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
+                result = requests.get(str(url), headers=headers, timeout=timeout, proxies=proxies, cookies=cookies)
             else:
-                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
-                getweb = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
-                getweb.encoding = 'utf-8'
-                return getweb.text
+                result = requests.get(str(url), headers=headers, timeout=timeout, cookies=cookies)
+
+            result.encoding = "utf-8"
+
+            if return_type == "object":
+                return result
+            else:
+                return result.text
+
         except requests.exceptions.ProxyError:
             print("[-]Connect retry {}/{}".format(i + 1, retry_count))
     print('[-]Connect Failed! Please check your Proxy or Network!')
@@ -49,11 +66,7 @@
 def post_html(url: str, query: dict) -> requests.Response:
     proxy, timeout, retry_count = config.Config().proxy()
-
-    if proxy:
-        proxies = {"http": "http://" + proxy, "https": "https://" + proxy}
-    else:
-        proxies = {}
+    proxies = get_proxy(proxy)

     for i in range(retry_count):
         try:
@@ -64,3 +77,26 @@ def post_html(url: str, query: dict) -> requests.Response:
             print("[-]Connect Failed! Please check your Proxy or Network!")
             input("Press ENTER to exit!")
             exit()
+
+
+def get_javlib_cookie() -> [dict, str]:
+    proxy, timeout, retry_count = config.Config().proxy()
+    proxies = get_proxy(proxy)
+
+    raw_cookie = {}
+    user_agent = ""
+
+    # Get __cfduid/cf_clearance and user-agent
+    for i in range(retry_count):
+        try:
+            raw_cookie, user_agent = cloudscraper.get_cookie_string(
+                "http://www.m45e.com/",
+                proxies=proxies
+            )
+            break  # challenge solved, stop retrying
+        except requests.exceptions.ProxyError:
+            print("[-] ProxyError, retry {}/{}".format(i + 1, retry_count))
+        except cloudscraper.exceptions.CloudflareIUAMError:
+            print("[-] IUAMError, retry {}/{}".format(i + 1, retry_count))
+
+    return raw_cookie, user_agent
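The two new helpers are designed to compose: get_javlib_cookie() has
cloudscraper solve Cloudflare's IUAM challenge and returns the raw cookie
string together with the User-Agent that passed it, while get_html() replays
both on ordinary requests. A minimal standalone sketch of that round trip
(same m45e.com mirror as the patch targets; retry handling omitted):

    from http.cookies import SimpleCookie

    from ADC_function import get_javlib_cookie, get_html

    raw_cookie, user_agent = get_javlib_cookie()

    # cloudscraper hands back cookies as a "k=v; k2=v2" string; requests
    # needs a dict, so parse it the same way javlib.main() does.
    cookies = {key: morsel.value for key, morsel in SimpleCookie(raw_cookie).items()}

    # The cf_clearance cookie is bound to the User-Agent that solved the
    # challenge, so the same UA must accompany every follow-up request.
    result = get_html("http://www.m45e.com/", cookies=cookies,
                      ua=user_agent, return_type="object")
    print(result.status_code, result.url)

Passing return_type="object" is what javlib.py below relies on to inspect
result.url and detect whether the search redirected straight to a detail page.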
diff --git a/config.ini b/config.ini
index 96d7585..546d505 100644
--- a/config.ini
+++ b/config.ini
@@ -17,7 +17,7 @@ naming_rule=number+'-'+title
 update_check=1

 [priority]
-website=javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321
+website=javbus,javdb,fanza,xcity,mgstage,fc2,avsox,jav321,javlib

 [escape]
 literals=\()/
diff --git a/core.py b/core.py
index 80f918d..0ec265a 100755
--- a/core.py
+++ b/core.py
@@ -15,6 +15,7 @@ import javbus
 import javdb
 import mgstage
 import xcity
+import javlib

 def escape_path(path, escape_literals: str):  # Remove escape literals
@@ -53,6 +54,7 @@ def get_data_from_json(file_number, filepath, conf: config.Config):  # from JSON
         "mgstage": mgstage.main,
         "jav321": jav321.main,
         "xcity": xcity.main,
+        "javlib": javlib.main,
     }

     # default fetch order list, from the beginning to the end
diff --git a/javlib.py b/javlib.py
new file mode 100644
index 0000000..bdd7d10
--- /dev/null
+++ b/javlib.py
@@ -0,0 +1,109 @@
+import json
+import bs4
+from bs4 import BeautifulSoup
+from lxml import html
+from http.cookies import SimpleCookie
+
+from ADC_function import get_javlib_cookie, get_html
+
+
+def main(number: str):
+    raw_cookies, user_agent = get_javlib_cookie()
+
+    # Blank cookies mean the javlib site returned an error
+    if not raw_cookies:
+        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+    # Manually construct a cookie dictionary
+    s_cookie = SimpleCookie()
+    s_cookie.load(raw_cookies)
+    cookies = {}
+    for key, morsel in s_cookie.items():
+        cookies[key] = morsel.value
+
+    # Scraping
+    result = get_html(
+        "http://www.m45e.com/cn/vl_searchbyid.php?keyword={}".format(number),
+        cookies=cookies,
+        ua=user_agent,
+        return_type="object"
+    )
+    soup = BeautifulSoup(result.text, "html.parser")
+    lx = html.fromstring(str(soup))
+
+    if "/?v=jav" in result.url:
+        dic = {
+            "title": get_title(lx, soup),
+            "studio": get_table_el_single_anchor(soup, "video_maker"),
+            "year": get_table_el_td(soup, "video_date")[:4],
+            "outline": "",
+            "director": get_table_el_single_anchor(soup, "video_director"),
+            "cover": get_cover(lx),
+            "imagecut": 1,
+            "actor_photo": "",
+            "website": result.url,
+            "source": "javlib.py",
+            "actor": get_table_el_multi_anchor(soup, "video_cast"),
+            "label": get_table_el_td(soup, "video_label"),
+            "tag": get_table_el_multi_anchor(soup, "video_genres"),
+            "number": get_table_el_td(soup, "video_id"),
+            "release": get_table_el_td(soup, "video_date"),
+            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
+        }
+    else:
+        dic = {}
+
+    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+
+def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
+    return lx.xpath(xpath)[0].strip()
+
+
+def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
+    tag = soup.find(id=tag_id).find("a")
+
+    if tag is not None:
+        return tag.string.strip()
+    else:
+        return ""
+
+
+def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
+    tags = soup.find(id=tag_id).find_all("a")
+
+    return process(tags)
+
+
+def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
+    tags = soup.find(id=tag_id).find_all("td", class_="text")
+
+    return process(tags)
+
+
+def process(tags: bs4.element.ResultSet) -> str:
+    values = []
+    for tag in tags:
+        value = tag.string
+        if value is not None and value != "----":
+            values.append(value)
+
+    return ",".join(x for x in values if x)
+
+
+def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
+    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
+    number = get_table_el_td(soup, "video_id")
+
+    return title.replace(number, "").strip()
+
+
+def get_cover(lx: html.HtmlElement) -> str:
+    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))
+
+
+if __name__ == "__main__":
+    # lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
+    lists = ["DVMC-003"]
+    for num in lists:
+        print(main(num))
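Both get_table_el_multi_anchor() and get_table_el_td() funnel through
process(), which keeps each matched tag's string, drops javlib's "----"
placeholder, and joins the survivors with commas. A self-contained
illustration (the markup below is hand-written to mimic the site's
id/table layout, not captured from javlib itself):

    from bs4 import BeautifulSoup

    from javlib import get_table_el_multi_anchor

    # Hypothetical snippet shaped like javlib's genre block.
    snippet = """
    <div id="video_genres">
      <table><tr><td class="text">
        <a href="#">Sample Genre</a> <a href="#">Another Genre</a> <a href="#">----</a>
      </td></tr></table>
    </div>
    """

    soup = BeautifulSoup(snippet, "html.parser")
    print(get_table_el_multi_anchor(soup, "video_genres"))
    # -> "Sample Genre,Another Genre"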
"outline": "", + "director": get_table_el_single_anchor(soup, "video_director"), + "cover": get_cover(lx), + "imagecut": 1, + "actor_photo": "", + "website": result.url, + "source": "javlib.py", + "actor": get_table_el_multi_anchor(soup, "video_cast"), + "label": get_table_el_td(soup, "video_label"), + "tag": get_table_el_multi_anchor(soup, "video_genres"), + "number": get_table_el_td(soup, "video_id"), + "release": get_table_el_td(soup, "video_date"), + "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'), + } + else: + dic = {} + + return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':')) + + +def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str: + return lx.xpath(xpath)[0].strip() + + +def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str: + tag = soup.find(id=tag_id).find("a") + + if tag is not None: + return tag.string.strip() + else: + return "" + + +def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str: + tags = soup.find(id=tag_id).find_all("a") + + return process(tags) + + +def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str: + tags = soup.find(id=tag_id).find_all("td", class_="text") + + return process(tags) + + +def process(tags: bs4.element.ResultSet) -> str: + values = [] + for tag in tags: + value = tag.string + if value is not None and value != "----": + values.append(value) + + return ",".join(x for x in values if x) + + +def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str: + title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()') + number = get_table_el_td(soup, "video_id") + + return title.replace(number, "").strip() + + +def get_cover(lx: html.HtmlComment) -> str: + return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src')) + + +if __name__ == "__main__": + # lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"] + lists = ["DVMC-003"] + for num in lists: + print(main(num)) diff --git a/requirements.txt b/requirements.txt index 8aa8934..435ca12 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ pyquery lxml beautifulsoup4 pillow -pyinstaller \ No newline at end of file +pyinstaller +cloudscraper