From 3e849ddc4a3912a300727ee45ead1499af907fe4 Mon Sep 17 00:00:00 2001 From: Feng4 Date: Sat, 16 Jan 2021 18:33:27 +0800 Subject: [PATCH] Update javlib.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 解决javlib部分番号匹配不到问题 --- WebCrawler/javlib.py | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/WebCrawler/javlib.py b/WebCrawler/javlib.py index ff2c22d..1e1ba0d 100644 --- a/WebCrawler/javlib.py +++ b/WebCrawler/javlib.py @@ -2,6 +2,7 @@ import sys sys.path.append('../') import json import bs4 +import re from bs4 import BeautifulSoup from lxml import html from http.cookies import SimpleCookie @@ -32,7 +33,10 @@ def main(number: str): ) soup = BeautifulSoup(result.text, "html.parser") lx = html.fromstring(str(soup)) - + + fanhao_pather = re.compile(r'
(.*?)
') + fanhao = fanhao_pather.findall(result.text) + if "/?v=jav" in result.url: dic = { "title": get_title(lx, soup), @@ -53,6 +57,41 @@ def main(number: str): "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'), "series":'', } + elif number.upper() in fanhao: + url_pather = re.compile(r'
(.*?)
') + s = {} + url_list = url_pather.findall(result.text) + for url in url_list: + s[url[1]] = 'http://www.javlibrary.com/cn/' + url[0].lstrip('.') + av_url = s[number.upper()] + result = get_html( + av_url, + cookies=cookies, + ua=user_agent, + return_type="object" + ) + soup = BeautifulSoup(result.text, "html.parser") + lx = html.fromstring(str(soup)) + + dic = { + "title": get_title(lx, soup), + "studio": get_table_el_single_anchor(soup, "video_maker"), + "year": get_table_el_td(soup, "video_date")[:4], + "outline": "", + "director": get_table_el_single_anchor(soup, "video_director"), + "cover": get_cover(lx), + "imagecut": 1, + "actor_photo": "", + "website": result.url, + "source": "javlib.py", + "actor": get_table_el_multi_anchor(soup, "video_cast"), + "label": get_table_el_td(soup, "video_label"), + "tag": get_table_el_multi_anchor(soup, "video_genres"), + "number": get_table_el_td(soup, "video_id"), + "release": get_table_el_td(soup, "video_date"), + "runtime": get_from_xpath(lx, '//*[@id="video_length"]/table/tr/td[2]/span/text()'), + "series": '', + } else: dic = {}