From f8dc05a38bad656a5d5ed186ea84ad0cce2ebc43 Mon Sep 17 00:00:00 2001 From: lededev Date: Tue, 12 Oct 2021 11:28:17 +0800 Subject: [PATCH] improve javbus and javdb outline source --- ADC_function.py | 2 +- WebCrawler/javbus.py | 12 +++++++++++- WebCrawler/javdb.py | 1 - WebCrawler/xcity.py | 13 ++++++++----- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/ADC_function.py b/ADC_function.py index 4480852..ed428bd 100755 --- a/ADC_function.py +++ b/ADC_function.py @@ -2,7 +2,7 @@ from os import replace import requests import hashlib from pathlib import Path -#import secrets +import secrets import os.path import uuid import json diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index c2ff11e..e739424 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -93,8 +93,12 @@ def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时 return '' def getOutline(number): #获取剧情介绍 从avno1.cc取得 try: + url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' + + secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']), + '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php' + ]) # 随机选一个,避免网站httpd日志中单个ip的请求太过单一 number_up = number.upper() - result, browser = get_html_by_form('http://www.avno1.cc/cn/usercenter.php?item=pay_support', + result, browser = get_html_by_form(url, form_select='div.wrapper > div.header > div.search > form', fields = {'kw' : number_up}, return_type = 'browser') @@ -107,6 +111,12 @@ def getOutline(number): #获取剧情介绍 从avno1.cc取得 return browser.page.select('div.type_movie > div > ul > li:nth-child(1) > div')[0]['data-description'].strip() except: pass + from WebCrawler.xcity import open_by_browser, getOutline as xcity_getOutline + try: + detail_html, browser = open_by_browser(number_up) + return xcity_getOutline(detail_html) + except: + pass return '' def getSerise(htmlcode): #获取系列 已修改 html = etree.fromstring(htmlcode, etree.HTMLParser()) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 358682d..4b0d4c9 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -5,7 +5,6 @@ from lxml import etree import json from bs4 import BeautifulSoup from ADC_function import * -import secrets # import sys # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index 858dd54..4bbdec1 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -181,11 +181,10 @@ def getExtrafanart(htmlcode): # 获取剧照 return s return '' -def main(number): - try: +def open_by_browser(number): xcity_number = number.replace('-','') query_result, browser = get_html_by_form( - 'https://xcity.jp/about/', + 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), fields = {'q' : xcity_number.lower()}, return_type = 'browser') if not query_result or not query_result.ok: @@ -193,12 +192,16 @@ def main(number): result = browser.follow_link(browser.links('avod\/detail')[0]) if not result.ok: raise ValueError("xcity.py: detail page not found") - detail_page = str(browser.page) + return str(browser.page), browser + +def main(number): + try: + detail_page, browser = open_by_browser(number) url = browser.url newnum = getNum(detail_page).upper() number_up = number.upper() if newnum != number_up: - if newnum == xcity_number.upper(): + if newnum == number.replace('-','').upper(): newnum = number_up else: raise ValueError("xcity.py: number not found")