"""Scrape movie metadata from javlibrary.com and return it as a JSON string."""
import sys
sys.path.append('../')
import json
import bs4
import re
from WebCrawler import airav
from bs4 import BeautifulSoup
from lxml import html
from http.cookies import SimpleCookie

from ADC_function import get_javlib_cookie, get_html

_BASE_URL = "http://www.javlibrary.com/cn/"

# Captures (relative href, video id) for every entry of a search-result page.
# NOTE(review): the markup inside this pattern was lost in the copy of the
# file under review (only the "(.*?)" group survived). It was rebuilt from the
# way the matches are consumed (href first, id second) -- confirm against the
# live search-result HTML.
_SEARCH_RESULT_RE = re.compile(r'<a href="(.*?)".*?><div class="id">(.*?)</div>')


def main(number: str):
    """Look up ``number`` on javlibrary and return its metadata as JSON.

    Args:
        number: Video id to search for, e.g. ``"IPX-292"``.

    Returns:
        A JSON string. It is ``{}`` when no javlib cookies are available,
        ``{"title": ""}`` when the id is not found in the search results, and
        the full metadata dictionary otherwise.
    """
    raw_cookies, user_agent = get_javlib_cookie()

    # Blank cookies mean the javlib site would answer with an error page.
    if not raw_cookies:
        return _to_json({})

    # Turn the raw cookie header into the plain dict that get_html expects.
    s_cookie = SimpleCookie()
    s_cookie.load(raw_cookies)
    cookies = {key: morsel.value for key, morsel in s_cookie.items()}

    result = get_html(
        _BASE_URL + "vl_searchbyid.php?keyword={}".format(number),
        cookies=cookies,
        ua=user_agent,
        return_type="object"
    )

    # If the final URL is already a detail page ("/?v=jav..."), scrape it
    # directly; otherwise we are on a result list and must pick the entry
    # whose id matches exactly.
    if "/?v=jav" not in result.url:
        detail_urls = {
            video_id: _BASE_URL + href.lstrip('.')
            for href, video_id in _SEARCH_RESULT_RE.findall(result.text)
        }
        av_url = detail_urls.get(number.upper())
        if av_url is None:
            return _to_json({"title": ""})
        result = get_html(
            av_url,
            cookies=cookies,
            ua=user_agent,
            return_type="object"
        )

    return _to_json(_build_metadata(number, result))


def _to_json(dic: dict) -> str:
    """Serialize ``dic`` with the formatting used for every result of main()."""
    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))


def _build_metadata(number: str, result) -> dict:
    """Parse a detail-page response into the metadata dictionary.

    Args:
        number: The id originally requested; used to fetch the outline.
        result: Response object for the detail page; its ``text`` and ``url``
            attributes are read.
    """
    soup = BeautifulSoup(result.text, "html.parser")
    lx = html.fromstring(str(soup))
    release = get_table_el_td(soup, "video_date")
    return {
        "title": get_title(lx, soup),
        "studio": get_table_el_single_anchor(soup, "video_maker"),
        "year": release[:4],
        "outline": get_outline(number),
        "director": get_table_el_single_anchor(soup, "video_director"),
        "cover": get_cover(lx),
        "imagecut": 1,
        "actor_photo": "",
        "website": result.url,
        "source": "javlib.py",
        "actor": get_table_el_multi_anchor(soup, "video_cast"),
        "label": get_table_el_td(soup, "video_label"),
        "tag": get_table_el_multi_anchor(soup, "video_genres"),
        "number": get_table_el_td(soup, "video_id"),
        "release": release,
        "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'),
        "series": '',
    }


def get_from_xpath(lx: html.HtmlElement, xpath: str) -> str:
    """Return the first match of ``xpath`` with surrounding whitespace removed.

    Raises:
        IndexError: If the expression matches nothing.
    """
    return lx.xpath(xpath)[0].strip()


def get_outline(number):
    """Return the ``outline`` field reported by the airav scraper, or ``''``.

    This is best effort: any ordinary failure (lookup, JSON decoding, missing
    key) yields an empty outline instead of aborting the scrape.
    """
    try:
        response = json.loads(airav.main(number))
        return response['outline']
    except Exception:
        return ''


def get_table_el_single_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    """Return the stripped text of the first ``<a>`` inside element ``tag_id``.

    Returns ``""`` when there is no anchor or the anchor has no single string
    child (``tag.string`` is ``None``).
    """
    tag = soup.find(id=tag_id).find("a")
    if tag is None or tag.string is None:
        return ""
    return tag.string.strip()


def get_table_el_multi_anchor(soup: BeautifulSoup, tag_id: str) -> str:
    """Return the comma-joined text of every ``<a>`` inside element ``tag_id``."""
    tags = soup.find(id=tag_id).find_all("a")
    return process(tags)


def get_table_el_td(soup: BeautifulSoup, tag_id: str) -> str:
    """Return the comma-joined text of every ``td.text`` inside element ``tag_id``."""
    tags = soup.find(id=tag_id).find_all("td", class_="text")
    return process(tags)


def process(tags: bs4.element.ResultSet) -> str:
    """Join the ``.string`` of each tag with commas.

    Tags whose string is ``None``, empty, or the literal ``"----"`` are
    skipped.
    """
    values = (tag.string for tag in tags)
    return ",".join(value for value in values if value and value != "----")


def get_title(lx: html.HtmlElement, soup: BeautifulSoup) -> str:
    """Return the page title with the video id removed from it."""
    title = get_from_xpath(lx, '//*[@id="video_title"]/h3/a/text()')
    number = get_table_el_td(soup, "video_id")
    return title.replace(number, "").strip()


def get_cover(lx: html.HtmlElement) -> str:
    """Return the cover image URL taken from ``#video_jacket_img``.

    The scheme ``http:`` is prepended to the ``src`` value unless it already
    is an absolute http(s) URL.
    """
    src = get_from_xpath(lx, '//*[@id="video_jacket_img"]/@src')
    if src.startswith(("http://", "https://")):
        return src
    return "http:{}".format(src)


if __name__ == "__main__":
    lists = ["IPX-292", "STAR-438", "JKREZ-001", "KMHRS-010", "KNSD-023"]
    for num in lists:
        print(main(num))