# AV_Data_Capture/WebCrawler/getchu.py

import sys
sys.path.append('../')
from ADC_function import *
from WebCrawler.crawler import *
import re
import time
from urllib.parse import quote

# Extra request headers passed as ``json_headers`` on dl.getchu.com fetches.
JSON_HEADERS = dict(Referer="https://dl.getchu.com/")
# Cookies passed on dl.getchu.com fetches.
COOKIES_DL = dict(adult_check_flag="1")
# Cookies passed on www.getchu.com fetches.
COOKIES_WWW = dict(getchu_adalt_flag="getchu.com")

# URL templates: the "_WORD_" placeholder is filled in with str.replace().
GETCHU_WWW_SEARCH_URL = (
    "http://www.getchu.com/php/search.phtml"
    "?genre=anime_dvd&search_keyword=_WORD_&check_key_dtl=1&submit="
)
GETCHU_DL_SEARCH_URL = (
    "https://dl.getchu.com/search/search_list.php"
    "?dojin=1&search_category_id=&search_keyword=_WORD_"
    "&btnWordSearch=%B8%A1%BA%F7&action=search&set_category_flag=1"
)
GETCHU_WWW_URL = "http://www.getchu.com/soft.phtml?id=_WORD_"
# "item_WORD_" becomes "item<id>" once the placeholder is replaced.
GETCHU_DL_URL = "https://dl.getchu.com/i/item_WORD_"

def get_dl_getchu(number):
    """Scrape metadata for one work from dl.getchu.com.

    If ``number`` contains "item" or (case-insensitively) "GETCHU", its first
    run of digits is used as the item id. Otherwise ``number`` is submitted
    as a search keyword and the id is taken from the first result link.

    Args:
        number: An item identifier or a search keyword.

    Returns:
        A dict of metadata fields, or None when no item id can be determined
        (empty search result, or no digits to extract). The "outline" value
        is a list of text fragments; ``main`` joins it into one string.
    """
    if "item" in number or 'GETCHU' in number.upper():
        id_source = number
    else:
        htmlcode = get_html(GETCHU_DL_SEARCH_URL.replace("_WORD_", number),
                            json_headers=JSON_HEADERS, cookies=COOKIES_DL)
        id_source = Crawler(htmlcode).getString(
            '/html/body/div[1]/table/tr/td/table[4]/tr/td[2]/table/tr[2]/td/table/tr/td/table/tr/td[2]/div/a[1]/@href')
        if id_source == "":
            return None
    ids = re.findall(r'\d+', id_source)
    if not ids:
        # No digits to use as an id: report "not found" instead of raising
        # IndexError so the caller can fall back to another crawler.
        return None
    number = ids[0]

    detail_url = GETCHU_DL_URL.replace("_WORD_", number)
    htmlcode = get_html(detail_url, json_headers=JSON_HEADERS, cookies=COOKIES_DL)
    getchu = Crawler(htmlcode)

    # The circle name feeds several fields, so query it once.
    circle = getchu.getString("//td[contains(text(),'サークル')]/following-sibling::td/a/text()")
    release = getchu.getString(
        "//td[contains(text(),'配信開始日')]/following-sibling::td/text()").replace("/", "-")
    # Take the first match explicitly: stringifying the findall() list and
    # stripping brackets leaves "', '" separators when there are several.
    runtime_matches = re.findall(r'\d+', str(getchu.getString(
        "//td[contains(text(),'画像数&ページ数')]/following-sibling::td/text()")))
    year_matches = re.findall(r'\d{4}', release)
    fanart_paths = getchu.getStrings("//td[contains(@style,'background-color: #444444;')]/a/@href")

    dic = {
        "title": getchu.getString("//div[contains(@style,'color: #333333; padding: 3px 0px 0px 5px;')]/text()"),
        "cover": "https://dl.getchu.com" + getchu.getString("//td[contains(@bgcolor,'#ffffff')]/img/@src"),
        "director": getchu.getString("//td[contains(text(),'作者')]/following-sibling::td/text()").strip(),
        "studio": circle.strip(),
        "actor": circle.strip(),
        "label": circle.strip(),
        "runtime": runtime_matches[0] if runtime_matches else "",
        "release": release,
        "tag": getchu.getStrings("//td[contains(text(),'趣向')]/following-sibling::td/a/text()"),
        "outline": getchu.getStrings("//*[contains(text(),'作品内容')]/following-sibling::td/text()"),
        # Image hrefs are prefixed with the site origin.
        "extrafanart": ["https://dl.getchu.com" + path for path in fanart_paths],
        "series": circle,
        "number": 'GETCHU-' + number,
        "imagecut": 4,
        "year": year_matches[0] if year_matches else "",
        "actor_photo": "",
        # Record the page that was actually fetched (including the "item"
        # prefix) rather than a URL rebuilt without it.
        "website": detail_url,
        "source": "getchu.py",
        "allow_number_change": True,
    }
    # One-second pause before returning; presumably request throttling.
    time.sleep(1)
    return dic

def get_www_getchu(number):
    """Scrape metadata for one work from www.getchu.com via keyword search.

    ``number`` is percent-encoded as EUC-JP and submitted to the site search;
    the first result link is followed and parsed. The search is attempted
    twice before giving up.

    Args:
        number: The search keyword.

    Returns:
        A dict of metadata fields, or None when the search yields no result
        link or the link contains no numeric id. The "outline" value is a
        list of text fragments; ``main`` joins it into one string.
    """
    number = quote(number, encoding="euc_jp")
    search_url = GETCHU_WWW_SEARCH_URL.replace("_WORD_", number)
    result_xpath = '//*[@id="detail_block"]/div/table/tr[1]/td/a[1]/@href'

    # Try the identical search request up to two times.
    url2 = ''
    for _ in range(2):
        url2 = Crawler(get_html(search_url, cookies=COOKIES_WWW)).getString(result_xpath)
        if url2 != '':
            break
    if url2 == '':
        return None

    # Result links are relative ("../soft.phtml?id=..."); make them absolute.
    url2 = url2.replace('../', 'http://www.getchu.com/')
    ids = re.findall(r'\d+', url2.replace("http://www.getchu.com/soft.phtml?id=", ""))
    if not ids:
        # No numeric id in the link: report "not found" instead of raising
        # IndexError so the caller can fall back to another crawler.
        return None

    getchu = Crawler(get_html(url2, cookies=COOKIES_WWW))

    # These values feed several fields each, so query them once.
    brand = getchu.getString("//td[contains(text(),'ブランド')]/following-sibling::td/a[1]/text()")
    genre = getchu.getString("//td[contains(text(),'ジャンル：')]/following-sibling::td/text()").strip()
    release = getchu.getString(
        "//td[contains(text(),'発売日：')]/following-sibling::td/a/text()").replace("/", "-")
    # Take the first match explicitly: stringifying the findall() list and
    # stripping brackets leaves "', '" separators when there are several.
    year_matches = re.findall(r'\d{4}', release)

    # Keep only sample links whose absolute URL contains "jpg".
    fanart_urls = (
        "http://www.getchu.com" + href.replace("./", '/')
        for href in getchu.getStrings("//div[contains(text(),'サンプル画像')]/following-sibling::div/a/@href")
    )
    extrafanart = [image_url for image_url in fanart_urls if 'jpg' in image_url]

    dic = {
        "title": getchu.getString('//*[@id="soft-title"]/text()').strip(),
        "cover": "http://www.getchu.com" + getchu.getString(
            "/html/body/div[1]/table[2]/tr[1]/td/a/@href").replace("./", '/'),
        "director": brand,
        "studio": brand.strip(),
        "actor": brand.strip(),
        "label": genre,
        "runtime": '',
        "release": release.strip(),
        "tag": getchu.getStrings("//td[contains(text(),'カテゴリ')]/following-sibling::td/a/text()"),
        "outline": getchu.getStrings("//div[contains(text(),'商品紹介')]/following-sibling::div/text()"),
        "extrafanart": extrafanart,
        "series": genre,
        "number": 'GETCHU-' + ids[0],
        "imagecut": 0,
        "year": year_matches[0] if year_matches else "",
        "actor_photo": "",
        "website": url2,
        "headers": {'referer': url2},
        "source": "getchu.py",
        "allow_number_change": True,
    }
    # One-second pause before returning; presumably request throttling.
    time.sleep(1)
    return dic

def main(number):
    """Look up ``number`` on the getchu sites and return its metadata.

    Every "-C" substring is removed from ``number`` first. If the result
    contains "item", dl.getchu.com is tried before www.getchu.com; otherwise
    the order is reversed. The first crawler that returns a result wins.

    Args:
        number: An item identifier or a search keyword.

    Returns:
        A JSON string of the metadata with "outline" joined into one string,
        or the dict ``{"title": ""}`` when neither crawler finds the work.
    """
    number = number.replace("-C", "")
    # Dispatch through function references rather than eval() of code strings.
    if "item" in number:
        crawlers = (get_dl_getchu, get_www_getchu)
    else:
        crawlers = (get_www_getchu, get_dl_getchu)

    dic = None
    for crawl in crawlers:
        dic = crawl(number)
        if dic is not None:
            break
    if dic is None:
        # NOTE(review): this path returns a dict while success returns a JSON
        # string; kept as is because callers are not visible here.
        return {"title" : ""}

    # The crawlers return the outline as a list of text fragments.
    dic['outline'] = ''.join(dic['outline'])

    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

if __name__ == '__main__':
    # Manual check: put identifiers/keywords in the list to print each one
    # followed by the result of main() for it.
    samples = []
    for sample in samples:
        print(sample)
        print(main(sample))