From b7e08455824ff9220a81882794d922726a1de077 Mon Sep 17 00:00:00 2001 From: Max Zhao Date: Sun, 6 Sep 2020 16:57:58 +0800 Subject: [PATCH 1/3] [WebCrawler/javdb] cut cover as poster when gray image exists --- WebCrawler/javdb.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 362ab94..5beefa0 100644 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -118,12 +118,15 @@ def main(number): correct_url = urls[ids.index(number)] detail_page = get_html('https://javdb.com' + correct_url) + # no cut image by default + imagecut = 3 # If gray image exists ,then replace with normal cover cover_small = getCover_small(query_result, index=ids.index(number)) if 'placeholder' in cover_small: + # replace with normal cover and cut it + imagecut = 1 cover_small = getCover(detail_page) - dic = { 'actor': getActor(detail_page), 'title': getTitle(detail_page), @@ -135,7 +138,7 @@ def main(number): 'number': getNum(detail_page), 'cover': getCover(detail_page), 'cover_small': cover_small, - 'imagecut': 3, + 'imagecut': imagecut, 'tag': getTag(detail_page), 'label': getLabel(detail_page), 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), From 3d9c92aac537d95bfa1bcceeab0c6e2f408e5452 Mon Sep 17 00:00:00 2001 From: Max Zhao Date: Sun, 6 Sep 2020 17:36:17 +0800 Subject: [PATCH 2/3] [WebCrawler/javdb] remove actor when actor is 'N/A' --- WebCrawler/javdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 5beefa0..2e4924b 100644 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -17,7 +17,7 @@ def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1] html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//strong[contains(text(),"演員")]/../span/text()')).strip(" ['']") result2 = 
str(html.xpath('//strong[contains(text(),"演員")]/../span/a/text()')).strip(" ['']") - return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').lstrip(',').replace(',', ', ') + return str(result1 + result2).strip('+').replace(",\\xa0", "").replace("'", "").replace(' ', '').replace(',,', '').replace('N/A', '').lstrip(',').replace(',', ', ') def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img a = actor.split(',') d={} From 42e646e92ca9d7cdfcce7d09c6924c384f229b9d Mon Sep 17 00:00:00 2001 From: Max Zhao Date: Sun, 6 Sep 2020 18:09:49 +0800 Subject: [PATCH 3/3] [WebCrawler/javdb] refine title value --- WebCrawler/javdb.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index 2e4924b..f8b1fdb 100644 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -127,15 +127,21 @@ def main(number): imagecut = 1 cover_small = getCover(detail_page) + number = getNum(detail_page) + title = getTitle(detail_page) + if title and number: + # remove duplicate title + title = title.replace(number, '').strip() + dic = { 'actor': getActor(detail_page), - 'title': getTitle(detail_page), + 'title': title, 'studio': getStudio(detail_page), 'outline': getOutline(detail_page), 'runtime': getRuntime(detail_page), 'director': getDirector(detail_page), 'release': getRelease(detail_page), - 'number': getNum(detail_page), + 'number': number, 'cover': getCover(detail_page), 'cover_small': cover_small, 'imagecut': imagecut, @@ -156,4 +162,4 @@ def main(number): # main('DV-1562') # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") if __name__ == "__main__": - print(main('snyz-007')) + print(main('GS-351'))