# Login cookies exported from a browser let the crawler open pages as a
# logged-in member that anonymous visitors cannot reach
# (example: FC2-755670, url https://javdb9.com/v/vO8Mn).
#
# Cookie file convention:
#   * file name: "<site name>.json", e.g. "javdb9.json"
#   * encoding:  UTF-8
#   * content:   one flat JSON object mapping cookie name -> cookie value, e.g.
#       {
#           "over18": "1",
#           "redirect_to": "...",
#           "remember_me_token": "...",
#           "_jdb_session": "...",
#           "locale": "zh",
#           "__cfduid": "...",
#           "theme": "auto"
#       }
#
# After logging in to the site, the cookie values can be copied or exported
# with a browser extension (CookieBro or EditThisCookie) or from the site
# information panel next to the address bar, then filled into the matching
# fields of the JSON file.
def load_cookies(filename):
    """Load browser-exported login cookies from a UTF-8 JSON file.

    Args:
        filename: Path of the cookie file, e.g. ``"javdb9.json"``.

    Returns:
        dict | None: The cookie name -> value mapping stored in the file, or
        ``None`` when the file is missing, unreadable, not valid UTF-8, not
        valid JSON, or does not contain a JSON object at the top level.
        Loading is best-effort: callers are expected to fall back to
        anonymous access when ``None`` is returned.
    """
    try:
        # The file format is documented as UTF-8, so do not depend on the
        # platform's locale encoding; ``with`` guarantees the handle is closed.
        with open(filename, encoding="utf-8") as cookie_file:
            cookies = json.load(cookie_file)
    except (OSError, ValueError):
        # OSError: file missing or unreadable.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    # Cookies must be a name -> value mapping; any other JSON document
    # (list, string, number, null) is treated as "no cookies available".
    if not isinstance(cookies, dict):
        return None
    return cookies
#//*[@id="star_qdt"]/li/a/img else: return {} - + def getStudio(a): # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") @@ -57,7 +57,7 @@ def getStudio(a): else: result = "" return result - + def getRuntime(a): html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") @@ -189,6 +189,9 @@ def getSeries(a): result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') + +javdb_site = "javdb9" + def main(number): try: # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): @@ -196,10 +199,12 @@ def main(number): # else: # number = number.upper() number = number.upper() + javdb_cookies = load_cookies(javdb_site + ".json") try: - query_result = get_html('https://javdb8.com/search?q=' + number + '&f=all') + javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' + query_result = get_html(javdb_url, cookies=javdb_cookies) except: - query_result = get_html('https://javdb.com/search?q=' + number + '&f=all') + query_result = get_html('https://javdb8.com/search?q=' + number + '&f=all') html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # javdb sometime returns multiple results, # and the first elememt maybe not the one we are looking for @@ -211,7 +216,11 @@ def main(number): else: ids =html.xpath('//*[@id="videos"]/div/div/a/div[contains(@class, "uid")]/text()') correct_url = urls[ids.index(number)] - detail_page = get_html('https://javdb8.com' + correct_url) + try: + javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url + detail_page = get_html(javdb_detail_url, 
cookies=javdb_cookies) + except: + detail_page = get_html('https://javdb8.com' + correct_url) # no cut image by default imagecut = 3 @@ -266,3 +275,4 @@ if __name__ == "__main__": # print(main('blacked.20.05.30')) # print(main('AGAV-042')) print(main('BANK-022')) + print(main('FC2-735670'))