Fix source getchu

2022-05-24 22:34:34 +08:00
parent 26528b7e19
commit f4322cf427
1 changed files with 87 additions and 77 deletions
--- a/WebCrawler/getchu.py
+++ b/WebCrawler/getchu.py
@@ -3,26 +3,47 @@ sys.path.append('../')
 from ADC_function import *
 from WebCrawler.crawler import *
 import re
+import time
 from urllib.parse import quote

-def get_itemxxx_web(number):
-    getchu = Crawler(get_html("https://dl.getchu.com/i/" + number))
+JSON_HEADERS = {"Referer": "https://dl.getchu.com/"}
+COOKIES_DL = {"adult_check_flag": "1"}
+COOKIES_WWW = {'getchu_adalt_flag': 'getchu.com'}
+
+GETCHU_WWW_SEARCH_URL = 'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit='
+GETCHU_DL_SEARCH_URL = 'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&search_keyword=_WORD_&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1'
+GETCHU_WWW_URL = 'http://www.getchu.com/soft.phtml?id=_WORD_'
+GETCHU_DL_URL = 'https://dl.getchu.com/i/item_WORD_'
+
+def get_dl_getchu(number):
+    if "item" in number or 'GETCHU' in number.upper():
+        number = re.findall('\d+',number)[0]
+    else:
+        htmlcode = get_html(GETCHU_DL_SEARCH_URL.replace("_WORD_", number),
+                            json_headers=JSON_HEADERS, cookies=COOKIES_DL)
+        getchu = Crawler(htmlcode)
+        url = getchu.getString(
+            '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
+        if url == "":
+            return None
+        number = re.findall('\d+', url)[0]
+    htmlcode = get_html(GETCHU_DL_URL.replace("_WORD_", number), json_headers=JSON_HEADERS, cookies=COOKIES_DL)
+    getchu = Crawler(htmlcode)
    dic = {
        "title": getchu.getString("//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"),
        "cover": "https://dl.getchu.com" + getchu.getString("//td[contains(@bgcolor,'#ffffff')]/img/@src"),
-        "director": getchu.getString("//td[contains(text(),'作者')]/following-sibling::td/text()"),
-        "studio": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()"),
-        "actor": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()"),
-        "label": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()"),
+        "director": getchu.getString("//td[contains(text(),'作者')]/following-sibling::td/text()").strip(),
+        "studio": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
+        "actor": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
+        "label": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()").strip(),
        "runtime": str(re.findall('\d+', str(getchu.getString(
            "//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()")))).strip(" ['']"),
-        "release": getchu.getString("//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/",
-                                                                                                           "-"),
+        "release": getchu.getString("//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-"),
        "tag": getchu.getStrings("//td[contains(text(),'趣向')]/following-sibling::td/a/text()"),
        "outline": getchu.getStrings("//*[contains(text(),'作品内容')]/following-sibling::td/text()"),
        "extrafanart": getchu.getStrings("//td[contains(@style,'background-color: #444444;')]/a/@href"),
        "series": getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()"),
-        "number": number,
+        "number": 'GETCHU-' + re.findall('\d+',number)[0],
        "imagecut": 4,
        "year": str(re.findall('\d{4}', str(getchu.getString(
            "//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-")))).strip(" ['']"),
@@ -36,78 +57,66 @@ def get_itemxxx_web(number):
        i = "https://dl.getchu.com" + i
        extrafanart.append(i)
    dic['extrafanart'] = extrafanart
+    time.sleep(1)
+    return dic
+
+def get_www_getchu(number):
+    number = quote(number, encoding="euc_jp")
+    getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW))
+    url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
+    if url2 == '':
+        getchu = Crawler(get_html(GETCHU_WWW_SEARCH_URL.replace("_WORD_", number), cookies=COOKIES_WWW))
+        url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
+        if url2 == "":
+            return None
+    url2 = url2.replace('../', 'http://www.getchu.com/')
+    getchu = Crawler(get_html(url2, cookies=COOKIES_WWW))
+    dic = {
+        "title": getchu.getString('//*[@id="soft-title"]/text()').strip(),
+        "cover": "http://www.getchu.com" + getchu.getString(
+            "/html/body/div[1]/table[2]/tr[1]/td/a/@href").replace("./", '/'),
+        "director": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"),
+        "studio": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(),
+        "actor": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()").strip(),
+        "label": getchu.getString("//td[contains(text(),'ジャンル：')]/following-sibling::td/text()").strip(),
+        "runtime": '',
+        "release": getchu.getString("//td[contains(text(),'発売日：')]/following-sibling::td/a/text()").replace("/", "-").strip(),
+        "tag": getchu.getStrings("//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"),
+        "outline": getchu.getStrings("//div[contains(text(),'商品紹介')]/following-sibling::div/text()"),
+        "extrafanart": getchu.getStrings("//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"),
+        "series": getchu.getString("//td[contains(text(),'ジャンル：')]/following-sibling::td/text()").strip(),
+        "number": 'GETCHU-' + re.findall('\d+', url2.replace("http://www.getchu.com/soft.phtml?id=", ""))[0],
+        "imagecut": 0,
+        "year": str(re.findall('\d{4}', str(getchu.getString(
+            "//td[contains(text(),'発売日：')]/following-sibling::td/a/text()").replace("/", "-")))).strip(" ['']"),
+        "actor_photo": "",
+        "website": url2,
+        "headers": {'referer': url2},
+        "source": "getchu.py",
+        "allow_number_change": True,
+    }
+    extrafanart = []
+    for i in dic['extrafanart']:
+        i = "http://www.getchu.com" + i.replace("./", '/')
+        if 'jpg' in i:
+            extrafanart.append(i)
+    dic['extrafanart'] = extrafanart
+    time.sleep(1)
    return dic

 def main(number):
+    number = number.replace("-C", "")
+    dic = {}
    if "item" in number:
-        dic = get_itemxxx_web(number)
+        sort = ["get_dl_getchu(number)", "get_www_getchu(number)"]
    else:
-        display_number = number #quote(number,encoding="GBK")
-        htmlcode = get_html(f'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword={number}&'
-                            f'check_key_dtl=1&submit=',cookies={'getchu_adalt_flag':'getchu.com'})
-        getchu = Crawler(htmlcode)
-        url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
-        if url2 == '':
-            number = quote(number,encoding="euc_jp")
-            htmlcode = get_html(f'http://www.getchu.com/php/search.phtml?genre=anime_dvd&search_keyword={number}'
-                                f'&check_key_dtl=1&submit=', cookies={'getchu_adalt_flag': 'getchu.com'})
-            getchu = Crawler(htmlcode)
-            url2 = getchu.getString('//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href')
-        if "id=" in url2:
-            url2 = url2.replace('../', 'http://www.getchu.com/')
-            htmlcode = get_html(url2,cookies={'getchu_adalt_flag':'getchu.com'})
-            getchu = Crawler(htmlcode)
-            dic = {
-                "title": getchu.getString('//*[@id="soft-title"]/text()').strip(),
-                "cover": "http://www.getchu.com" + getchu.getString(
-                    "/html/body/div[1]/table[2]/tr[1]/td/a/@href").replace("./", '/'),
-                "director": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"),
-                "studio": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"),
-                "actor": getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()"),
-                "label": getchu.getString("//td[contains(text(),'ジャンル：')]/following-sibling::td/text()").strip(),
-                "runtime": '',
-                "release": getchu.getString("//td[contains(text(),'発売日：')]/following-sibling::td/a/text()").replace("/","-").strip(),
-                "tag": getchu.getStrings("//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"),
-                "outline": getchu.getStrings("//div[contains(text(),'商品紹介')]/following-sibling::div/text()"),
-                "extrafanart": getchu.getStrings("//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href"),
-                "series": getchu.getString("//td[contains(text(),'ジャンル：')]/following-sibling::td/text()").strip(),
-                "number": display_number,
-                "imagecut": 0,
-                "year": str(re.findall('\d{4}', str(getchu.getString(
-                    "//td[contains(text(),'発売日：')]/following-sibling::td/a/text()").replace("/", "-")))).strip(" ['']"),
-                "actor_photo": "",
-                "website": url2,
-                "headers": {'referer': url2},
-                "source": "getchu.py",
-                "allow_number_change": True,
-            }
-            extrafanart = []
-            for i in dic['extrafanart']:
-                i = "http://www.getchu.com" + i.replace("./", '/')
-                if 'jpg' in i:
-                    extrafanart.append(i)
-            dic['extrafanart'] = extrafanart
-        else:
-            #number = quote(number, encoding="euc_jp")
-            htmlcode = get_html(f'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&'
-                                f'search_keyword={number}&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1',
-                                json_headers = {"Referer": "https://dl.getchu.com/"},cookies={"adult_check_flag":"1"})
-            getchu = Crawler(htmlcode)
-            url2 = getchu.getString('/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
-            if "i/item" in url2:
-                dic = get_itemxxx_web(re.findall('item\d+',url2)[0])
-            else:
-                number = quote(number, encoding="euc_jp")
-                htmlcode = get_html(f'https://dl.getchu.com/search/search_list.php?dojin=1&search_category_id=&'
-                                    f'search_keyword={number}&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1',
-                                    json_headers={"Referer": "https://dl.getchu.com/"},cookies={"adult_check_flag": "1"})
-                getchu = Crawler(htmlcode)
-                url2 = getchu.getString(
-                    '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
-                if "i/item" in url2:
-                    dic = get_itemxxx_web(re.findall('item\d+', url2)[0])
-                else:
-                    return {'title':''}
+        sort = ["get_www_getchu(number)", "get_dl_getchu(number)"]
+    for i in sort:
+        dic = eval(i)
+        if dic != None:
+            break
+    if dic == None:
+        return {"title" : ""}
    outline = ''
    _list = dic['outline']
    for i in _list:
@@ -118,6 +127,7 @@ def main(number):
    return result

 if __name__ == '__main__':
-    test = ['こすっち094','なちゅらるばけーしょん','item4039026']
+    test = []
    for i in test:
+        print(i)
        print(main(i))