AV_Data_Capture/WebCrawler/javlib.py
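"""Scraper for javlibrary.com: look up a movie by its ID number and return its metadata as a JSON string."""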

import sys
sys.path.append('../')
import json
import bs4
import re
from WebCrawler import airav
from bs4 import BeautifulSoup
from lxml import html
from http.cookies import SimpleCookie
from ADC_function import get_javlib_cookie, get_html


def main(number: str):
    """Look up `number` on javlibrary and return the scraped metadata as a JSON string."""
    raw_cookies, user_agent = get_javlib_cookie()

    # Blank cookies mean the javlib site returned an error
    if not raw_cookies:
        return json.dumps({}, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

    # Manually construct a cookie dictionary from the raw cookie string
    s_cookie = SimpleCookie()
    s_cookie.load(raw_cookies)
    cookies = {}
    for key, morsel in s_cookie.items():
        cookies[key] = morsel.value

    # Scraping: search javlibrary by the movie ID number
    result = get_html(
        "http://www.javlibrary.com/cn/vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies,
        ua=user_agent,
        return_type="object"
    )

    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))

    # Collect the ID numbers (fanhao) listed on the search result page
    fanhao_pather = re.compile(r'<a href=".*?".*?><div class="id">(.*?)</div>')
    fanhao = fanhao_pather.findall(result.text)

    if "/?v=jav" in result.url:
        # The search redirected straight to a detail page
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": get_outline(number),
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_td(soup, "video_label"),
            "tag": get_table_el_multi_anchor(soup, "video_genres"),
            "number": get_table_el_td(soup, "video_id"),
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    elif number.upper() in fanhao:
        # The search returned a result list: map each listed ID to its detail
        # page URL, then fetch the page for the exact match
        url_pather = re.compile(r'<a href="(.*?)".*?><div class="id">(.*?)</div>')
        s = {}
        url_list = url_pather.findall(result.text)
        for url in url_list:
            s[url[1]] = 'http://www.javlibrary.com/cn/' + url[0].lstrip('.')
        av_url = s[number.upper()]
        result = get_html(
            av_url,
            cookies=cookies,
            ua=user_agent,
            return_type="object"
        )
        soup = BeautifulSoup(result.text, "html.parser")
        lx = html.fromstring(str(soup))
        dic = {
            "title": get_title(lx, soup),
            "studio": get_table_el_single_anchor(soup, "video_maker"),
            "year": get_table_el_td(soup, "video_date")[:4],
            "outline": get_outline(number),
            "director": get_table_el_single_anchor(soup, "video_director"),
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "javlib.py",
            "actor": get_table_el_multi_anchor(soup, "video_cast"),
            "label": get_table_el_td(soup, "video_label"),
            "tag": get_table_el_multi_anchor(soup, "video_genres"),
            "number": get_table_el_td(soup, "video_id"),
            "release": get_table_el_td(soup, "video_date"),
            "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
            "series": '',
        }
    else:
        # No matching entry found
        dic = {"title": ""}

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    return lx.xpath(xpath)[0].strip()


def get_outline(number):
    # Reuse the airav crawler to fetch the plot outline for this number
    try:
        response = json.loads(airav.main(number))
        result = response['outline']
        return result
    except Exception:
        return ''


def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tag = soup.find(id=tag_id).find("a")
    if tag is not None:
        return tag.string.strip()
    else:
        return ""


def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("a")
    return process(tags)


def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
    tags = soup.find(id=tag_id).find_all("td", class_="text")
    return process(tags)


def process(tags: bs4.element.ResultSet) -> str:
    # Join the text of all tags, skipping empty values and the "----" placeholder
    values = []
    for tag in tags:
        value = tag.string
        if value is not None and value != "----":
            values.append(value)
    return ",".join(x for x in values if x)


def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
    # The page title contains the ID number; strip it to get the bare title
    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
    number = get_table_el_td(soup, "video_id")
    return title.replace(number, "").strip()


def get_cover(lx: html.HtmlElement) -> str:
    return "http:{}".format(get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src'))


if __name__ == "__main__":
    lists = ["IPX-292", "STAR-438", "JKREZ-001", "KMHRS-010", "KNSD-023"]
    # lists = ["DVMC-003"]
    for num in lists:
        print(main(num))
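
# Usage sketch (illustrative, not part of the module): since main() returns a
# JSON string, callers elsewhere in the project can parse it with json.loads, e.g.
#   data = json.loads(main("IPX-292"))
#   print(data.get("title", ""))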