Files
AV_Data_Capture/jav321.py
2020-03-25 19:30:59 +09:00

139 lines
3.2 KiB
Python

import json
from bs4 import BeautifulSoup
from lxml import html
from ADC_function import post_html
def main(number: str) -> str:
    """Look up *number* on jav321.com and return the scraped metadata as JSON.

    The search form is submitted through ``post_html``. When the URL of the
    response contains ``/video/`` the response body is treated as a detail
    page and scraped; otherwise an empty JSON object is returned.

    Args:
        number: Value sent as the ``sn`` field of the search request.

    Returns:
        A JSON document serialized to ``str`` (keys sorted, non-ASCII
        characters kept as-is). It is ``{}`` when no detail page was reached.
    """
    # NOTE(review): post_html is defined elsewhere; this code only relies on
    # the returned object exposing ``.url`` and ``.text``.
    result = post_html(url="https://www.jav321.com/search", query={"sn": number})

    if "/video/" not in result.url:
        # Not a detail page: nothing to parse, so skip building the trees.
        dic = {}
    else:
        soup = BeautifulSoup(result.text, "html.parser")
        # The lxml tree is built from the BeautifulSoup-normalized markup so
        # the absolute XPath expressions used by the get_* helpers see the
        # same structure as before.
        lx = html.fromstring(str(soup))
        data = parse_info(soup)
        dic = {
            "title": get_title(lx),
            "studio": "",
            "year": get_year(data),
            "outline": get_outline(lx),
            "director": "",
            "cover": get_cover(lx),
            "imagecut": 1,
            "actor_photo": "",
            "website": result.url,
            "source": "jav321.py",
            # Fields parsed from the info column are merged in last.
            **data,
        }

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
def get_title(lx: html.HtmlElement) -> str:
    """Return the stripped text of the page's title ``<h3>`` node.

    Raises:
        IndexError: if the XPath expression matches nothing.
    """
    title_xpath = "/html/body/div[2]/div[1]/div[1]/div[1]/h3/text()"
    matches = lx.xpath(title_xpath)
    first_match = matches[0]
    return first_match.strip()
def parse_info(soup: BeautifulSoup) -> dict:
    """Extract the labelled fields from the ``div.row > div.col-md-9`` column.

    The column's markup is cut at every ``<br/>``; each piece is indexed by
    the text of its ``<b>`` label (see ``get_bold_text``), and the known
    labels are then turned into the result fields.

    Returns:
        A dict with the keys ``actor``, ``label``, ``tag``, ``number``,
        ``release`` and ``runtime``, or an empty dict when the column is not
        found.
    """
    column = soup.select_one("div.row > div.col-md-9")
    if not column:
        return {}

    # Later pieces with the same label overwrite earlier ones.
    pieces_by_label = {
        get_bold_text(h=piece): piece
        for piece in str(column).split("<br/>")
    }

    return {
        "actor": get_actor(pieces_by_label),
        "label": get_label(pieces_by_label),
        "tag": get_tag(pieces_by_label),
        "number": get_number(pieces_by_label),
        "release": get_release(pieces_by_label),
        "runtime": get_runtime(pieces_by_label),
    }
def get_bold_text(h: str) -> str:
    """Return the text of the first ``<b>`` tag in the HTML fragment *h*.

    Fragments without a ``<b>`` tag yield the placeholder ``"UNKNOWN_TAG"``.
    """
    bold = BeautifulSoup(h, "html.parser").b
    return bold.text if bold else "UNKNOWN_TAG"
def get_anchor_info(h: str) -> str:
    """Return the texts of all ``<a href=...>`` tags in *h*, comma-joined.

    An empty string is returned when the fragment contains no such anchors.
    """
    anchors = BeautifulSoup(h, "html.parser").find_all("a", href=True)
    return ",".join(anchor.text for anchor in anchors)
def get_text_info(h: str) -> str:
    """Return everything that follows the first ``": "`` in *h*.

    Args:
        h: A ``label: value`` fragment.

    Returns:
        The value part, unmodified. If the value itself contains ``": "`` it
        is kept whole rather than cut at the second separator. An empty
        string is returned when *h* has no ``": "`` at all, matching the
        empty-string default used for missing fields elsewhere in this
        module, instead of raising ``IndexError``.
    """
    _, separator, value = h.partition(": ")
    return value if separator else ""
def get_cover(lx: html.HtmlElement) -> str:
    """Return the ``src`` attribute of the cover ``<img>`` node.

    Raises:
        IndexError: if the XPath expression matches nothing.
    """
    cover_xpath = "/html/body/div[2]/div[2]/div[1]/p/a/img/@src"
    sources = lx.xpath(cover_xpath)
    return sources[0]
def get_outline(lx: html.HtmlElement) -> str:
    """Return the first text node of the outline ``<div>``, or ``""``.

    A page without a matching outline node yields an empty string, the same
    default this module uses for other missing fields, rather than raising
    ``IndexError`` and aborting the whole scrape.
    """
    nodes = lx.xpath("/html/body/div[2]/div[1]/div[1]/div[2]/div[3]/div/text()")
    # xpath() returns a list for this expression; it is empty when nothing matches.
    return nodes[0] if nodes else ""
def get_actor(data: dict) -> str:
    """Return the comma-joined anchor texts of the "女优" (actress) entry.

    Args:
        data: Mapping from bold label text to the raw HTML fragment for that
            label, as built by ``parse_info``.

    Returns:
        The result of ``get_anchor_info`` for the entry, or ``""`` when the
        label is absent.
    """
    if "女优" not in data:
        return ""
    return get_anchor_info(data["女优"])
def get_label(data: dict) -> str:
    """Return the comma-joined anchor texts of the "片商" (publisher) entry.

    Args:
        data: Mapping from bold label text to the raw HTML fragment for that
            label, as built by ``parse_info``.

    Returns:
        The result of ``get_anchor_info`` for the entry, or ``""`` when the
        label is absent.
    """
    if "片商" not in data:
        return ""
    return get_anchor_info(data["片商"])
def get_tag(data: dict) -> str:
    """Return the comma-joined anchor texts of the "标签" (tags) entry.

    Args:
        data: Mapping from bold label text to the raw HTML fragment for that
            label, as built by ``parse_info``.

    Returns:
        The result of ``get_anchor_info`` for the entry, or ``""`` when the
        label is absent.
    """
    if "标签" not in data:
        return ""
    return get_anchor_info(data["标签"])
def get_number(data: dict) -> str:
    """Return the value of the "番号" (ID number) entry.

    Args:
        data: Mapping from bold label text to the raw HTML fragment for that
            label, as built by ``parse_info``.

    Returns:
        The result of ``get_text_info`` for the entry, or ``""`` when the
        label is absent.
    """
    if "番号" not in data:
        return ""
    return get_text_info(data["番号"])
def get_release(data: dict) -> str:
    """Return the value of the "发行日期" (release date) entry.

    Args:
        data: Mapping from bold label text to the raw HTML fragment for that
            label, as built by ``parse_info``.

    Returns:
        The result of ``get_text_info`` for the entry, or ``""`` when the
        label is absent.
    """
    if "发行日期" not in data:
        return ""
    return get_text_info(data["发行日期"])
def get_runtime(data: dict) -> str:
    """Return the value of the "播放时长" (playback duration) entry.

    Args:
        data: Mapping from bold label text to the raw HTML fragment for that
            label, as built by ``parse_info``.

    Returns:
        The result of ``get_text_info`` for the entry, or ``""`` when the
        label is absent.
    """
    if "播放时长" not in data:
        return ""
    return get_text_info(data["播放时长"])
def get_year(data: dict) -> str:
    """Return the first four characters of ``data["release"]``.

    Args:
        data: Parsed field dict as returned by ``parse_info``; may be empty.

    Returns:
        The leading four characters of the release string, or ``""`` when
        there is no ``"release"`` key.
    """
    return data.get("release", "")[:4]
if __name__ == "__main__":
    # Manual smoke run: scrape one sample number and print the JSON result.
    sample_output = main("wmc-002")
    print(sample_output)