Add support javlib

2020-04-23 10:05:53 +09:00
parent 274891fc21
commit b65581de47
6 changed files with 190 additions and 19 deletions
--- a/javlib.py
+++ b/javlib.py
@@ -0,0 +1,109 @@
+import json
+import bs4
+from bs4 import BeautifulSoup
+from lxml import html
+from http.cookies import SimpleCookie
+
+from ADC_function import get_javlib_cookie, get_html
+
+
+def main(number: str):
+    raw_cookies, user_agent = get_javlib_cookie()
+
+    # Blank cookies mean javlib site return error
+    if not raw_cookies:
+        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+    # Manually construct a dictionary
+    s_cookie = SimpleCookie()
+    s_cookie.load(raw_cookies)
+    cookies = {}
+    for key, morsel in s_cookie.items():
+        cookies[key] = morsel.value
+
+    # Scraping
+    result = get_html(
+        "http://www.m45e.com/cn/vl_searchbyid.php?keyword={}".format(number),
+        cookies=cookies,
+        ua=user_agent,
+        return_type="object"
+    )
+    soup = BeautifulSoup(result.text, "html.parser")
+    lx = html.fromstring(str(soup))
+
+    if "/?v=jav" in result.url:
+        dic = {
+            "title": get_title(lx, soup),
+            "studio": get_table_el_single_anchor(soup, "video_maker"),
+            "year": get_table_el_td(soup, "video_date")[:4],
+            "outline": "",
+            "director": get_table_el_single_anchor(soup, "video_director"),
+            "cover": get_cover(lx),
+            "imagecut": 1,
+            "actor_photo": "",
+            "website": result.url,
+            "source": "javlib.py",
+            "actor": get_table_el_multi_anchor(soup, "video_cast"),
+            "label": get_table_el_td(soup, "video_label"),
+            "tag": get_table_el_multi_anchor(soup, "video_genres"),
+            "number": get_table_el_td(soup, "video_id"),
+            "release": get_table_el_td(soup, "video_date"),
+            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
+        }
+    else:
+        dic = {}
+
+    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
+
+
+def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
+    return lx.xpath(xpath)[0].strip()
+
+
+def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
+    tag = soup.find(id=tag_id).find("a")
+
+    if tag is not None:
+        return tag.string.strip()
+    else:
+        return ""
+
+
+def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
+    tags = soup.find(id=tag_id).find_all("a")
+
+    return process(tags)
+
+
+def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
+    tags = soup.find(id=tag_id).find_all("td", class_="text")
+
+    return process(tags)
+
+
+def process(tags: bs4.element.ResultSet) -> str:
+    values = []
+    for tag in tags:
+        value = tag.string
+        if value is not None and value != "----":
+            values.append(value)
+
+    return ",".join(x for x in values if x)
+
+
+def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
+    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
+    number = get_table_el_td(soup, "video_id")
+
+    return title.replace(number, "").strip()
+
+
+def get_cover(lx: html.HtmlComment) -> str:
+    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))
+
+
+if __name__ == "__main__":
+    # lists = ["DVMC-003", "GS-0167", "JKREZ-001", "KMHRS-010", "KNSD-023"]
+    lists = ["DVMC-003"]
+    for num in lists:
+        print(main(num))