From e7ad2b085a1d7bcad4016a8ab95ba2cb2e41a7a5 Mon Sep 17 00:00:00 2001 From: Wait Date: Wed, 19 Aug 2020 03:07:56 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=20AVSOX=20=E9=87=8D=E5=AE=9A?= =?UTF-8?q?=E5=90=91=E5=88=B0=E5=8F=91=E5=B8=83=E9=A1=B5=E7=9A=84=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AVSOX 硬编码的地址有时候会 302 跳转到发布页,不如直接从发布页获取最新地址。 --- WebCrawler/avsox.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index f15b9b4..82c0b86 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -83,15 +83,17 @@ def getSeries(htmlcode): return '' def main(number): - a = get_html('https://avsox.host/cn/search/' + number) + html = get_html('https://tellme.pw/avsox') + site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0] + a = get_html(site + '/cn/search/' + number) html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") if result1 == '' or result1 == 'null' or result1 == 'None': - a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_')) + a = get_html(site + '/cn/search/' + number.replace('-', '_')) html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") if result1 == '' or result1 == 'null' or result1 == 'None': - a = get_html('https://avsox.host/cn/search/' + number.replace('_', '')) + a = get_html(site + '/cn/search/' + number.replace('_', '')) html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']") web = get_html(result1) @@ -121,4 +123,4 @@ def main(number): return js if __name__ == "__main__": - print(main('012717_472')) \ No newline at end of file + print(main('012717_472'))