from natsort import natsorted, ns
from pathlib import Path
import json
import os
import requests

# Path of the JSON input file. The main loop reads it as an object whose keys
# are page numbers (as strings) and whose values are the text of each page.
input_file = 'INPUT_FILE_NAME.json'
# Constant added to every page number from the input before it is printed.
page_offset = 0

if not os.path.exists(input_file):
    # ``quit()`` is a helper injected by the ``site`` module for interactive
    # sessions and exits silently with status 0.  Raising SystemExit with a
    # message reports the problem on stderr and yields a non-zero exit status.
    raise SystemExit(f'Input file not found: {input_file}')


def _headword_from_reading(reading, data):
    """Pick the spelling to report from a conjugation ``reading`` string.

    A reading without a space is returned unchanged.  Otherwise it is split on
    spaces (presumably ``<kanji> 【<kana>】`` -- confirm against the service
    output): when the word was written in kana in the text
    (``data['text'] == data['kana']``) the second part is returned with the
    ``【】`` brackets stripped, otherwise the first part is returned.
    """
    if ' ' not in reading:
        return reading

    parts = reading.split(' ')
    if data['text'] == data['kana']:
        # Return the kana.
        return parts[1].replace('【', '').replace('】', '')
    # Return the kanji.
    return parts[0]


def kanji_from_data(data):
    """Return the spelling of a word to show in the first output column.

    Lookup order:

    1. ``data['conj'][0]['reading']``, or the ``reading`` of its first ``via``
       entry, reduced by ``_headword_from_reading``.
    2. ``data['components'][0]['text']`` when components are present.
    3. ``data['text']`` when ``conj`` is absent or empty.

    ``'WORD_PENDING'`` is returned when ``conj`` is non-empty but none of the
    above yields a value, so the row is visibly flagged instead of printing
    ``None``.

    Args:
        data: Word-info dict from the segmentation response.

    Returns:
        The selected spelling as a string.

    Raises:
        KeyError: If a required ``text``/``kana`` key is missing.
    """
    has_conj = 'conj' in data

    if has_conj:
        if not data['conj']:
            return data['text']

        first = data['conj'][0]
        if 'reading' in first:
            return _headword_from_reading(first['reading'], data)
        # Guard against an empty ``via`` list or an entry without a reading.
        if first.get('via') and 'reading' in first['via'][0]:
            return _headword_from_reading(first['via'][0]['reading'], data)

    if data.get('components'):
        return data['components'][0]['text']

    if has_conj:
        # Non-empty conj in a shape we do not understand.
        return 'WORD_PENDING'

    return data['text']


def _kana_from_reading(reading):
    """Return the kana part of a conjugation ``reading`` string.

    A reading without a space is returned unchanged.  Otherwise the second
    space-separated part is returned with the ``【】`` brackets stripped
    (the reading is presumably ``<kanji> 【<kana>】`` -- confirm against the
    service output).
    """
    if ' ' not in reading:
        return reading
    return reading.split(' ')[1].replace('【', '').replace('】', '')


def kana_from_data(data):
    """Return the kana of a word to show in the second output column.

    Lookup order:

    1. ``data['conj'][0]['reading']``, or the ``reading`` of its first ``via``
       entry, reduced by ``_kana_from_reading``.
    2. ``data['components'][0]['kana']`` when components are present.
    3. ``data['kana']`` when ``conj`` is absent or empty.

    ``'WORD_PENDING'`` is returned when ``conj`` is non-empty but none of the
    above yields a value, so the row is visibly flagged instead of printing
    ``None``.

    Args:
        data: Word-info dict from the segmentation response.

    Returns:
        The selected kana as a string.

    Raises:
        KeyError: If a required ``kana`` key is missing.
    """
    has_conj = 'conj' in data

    if has_conj:
        if not data['conj']:
            return data['kana']

        first = data['conj'][0]
        if 'reading' in first:
            return _kana_from_reading(first['reading'])
        # Guard against an empty ``via`` list or an entry without a reading.
        if first.get('via') and 'reading' in first['via'][0]:
            return _kana_from_reading(first['via'][0]['reading'])

    if data.get('components'):
        return data['components'][0]['kana']

    if has_conj:
        # Non-empty conj in a shape we do not understand.
        return 'WORD_PENDING'

    return data['kana']


def pos_from_data(data):
    """Return the first part-of-speech tag found for a word.

    Lookup order: the word's own ``gloss``, then the first ``conj`` entry
    (its ``gloss``, else its ``prop`` wrapped as ``[pos]*``), then the first
    component's ``gloss`` wrapped as ``[pos]``.  Missing or empty lists are
    skipped rather than indexed, so an entry such as ``{'conj': []}`` yields
    the "not found" value instead of raising IndexError.

    Args:
        data: Word-info dict from the segmentation response.

    Returns:
        The part-of-speech string, or an empty list when none is found.
    """
    # Note: We're grabbing the first part of speech here, which may not be accurate in context.  Since this is only used
    # to filter out certain parts of speech, it would probably be best to return all parts of speech and then filter the
    # word out from results if none of the parts of speech are valid.  For now, look only at the first one.

    if data.get('gloss'):
        return data['gloss'][0]['pos']

    if data.get('conj'):
        first = data['conj'][0]
        if first.get('gloss'):
            return first['gloss'][0]['pos']
        if first.get('prop'):
            # Does this never have [] around it, or was 聞く a special case?
            return f"[{first['prop'][0]['pos']}]*"

    components = data.get('components')
    if components and components[0].get('gloss'):
        return f"[{components[0]['gloss'][0]['pos']}]"

    return []


def gloss_from_data(data):
    """Return the first gloss (definition text) found for a word.

    Lookup order: the first ``conj`` entry (its own ``gloss``, else the
    ``gloss`` of its first ``via`` entry), then the word's own ``gloss``,
    then the first component's ``gloss``.  Missing or empty lists at any
    level are skipped so that lookup continues with the next source instead
    of raising IndexError/KeyError.

    Args:
        data: Word-info dict from the segmentation response.

    Returns:
        The gloss string, or ``''`` when none is found.
    """
    if data.get('conj'):
        first = data['conj'][0]
        if first.get('gloss'):
            return first['gloss'][0]['gloss']
        via = first.get('via')
        # A via entry may lack its own gloss; fall through in that case.
        if via and via[0].get('gloss'):
            return via[0]['gloss'][0]['gloss']

    if data.get('gloss'):
        return data['gloss'][0]['gloss']

    components = data.get('components')
    if components and components[0].get('gloss'):
        return components[0]['gloss'][0]['gloss']

    return ''


def show_output(component, page, row_number):
    """Print one tab-separated row for a word unless its part of speech is filtered.

    The row holds, in order: kanji, kana, part of speech, gloss, row number
    and page.  Nothing is printed when no part of speech was found, when it
    equals one of the auxiliary tags listed below, when it contains one of
    the excluded markers, or when it contains ``int`` without ``exp``.

    Args:
        component: Word-info dict passed to the ``*_from_data`` helpers.
        page: Page number to print in the last column.
        row_number: Row counter to print in the fifth column.
    """
    kanji = kanji_from_data(component)
    kana = kana_from_data(component)
    speech = pos_from_data(component)
    gloss = gloss_from_data(component)

    # No usable part of speech at all.
    if speech in ('[]', []):
        return

    # Auxiliary tags are dropped only on an exact match.
    auxiliary_tags = (
        '[aux]',
        '[aux-adj]',
        '[aux,adj-na]',
        "[aux-v]",
        "[aux-v,vr]",
        "[v5aru,aux-v]",
        "[aux-v,v5r]",
    )
    if speech in auxiliary_tags:
        return

    # These markers exclude the word wherever they occur in the tag.
    excluded_markers = ('prt', 'pref', 'suf', 'conj', 'cop', 'ctr')
    if any(marker in speech for marker in excluded_markers):
        return

    # 'int' is excluded unless the tag also contains 'exp'.
    if 'int' in speech and 'exp' not in speech:
        return

    print(f'{kanji}\t{kana}\t{speech}\t{gloss}\t{row_number}\t{page}')


# Running row counter across all pages.  It starts at 1 and advances once per
# word handed to show_output, including words that show_output filters out.
row_number = 1

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
pages = json.loads(Path(input_file).read_text(encoding='utf-8'))
for page_key, page_text in natsorted(pages.items()):
    page_number = int(page_key) + page_offset

    # str.split() with no arguments breaks on any run of whitespace and never
    # yields empty strings, so every chunk can be sent as-is.
    for line in page_text.split():
        response = requests.post('http://localhost:3005/segmentation', json={"text": line})
        # Fail loudly on an HTTP error instead of trying to walk an error body.
        response.raise_for_status()

        # NOTE(review): the nesting below follows the shape of the segmentation
        # response; entries of length 1 carry no word data and are skipped --
        # confirm against the service's response format.
        for segment in response.json():
            for alternative in segment:

                if 1 == len(alternative):
                    continue

                for word in alternative[0]:
                    if 1 == len(word):
                        continue

                    info = word[1]
                    if 'alternative' in info:
                        continue
                    if 'components' not in info:
                        show_output(info, page_number, row_number)
                        row_number += 1
                        continue
                    for component in info['components']:
                        if 'suffix' in component:
                            continue
                        # Emit the component itself.  Passing the parent entry
                        # here would print the first component's data once per
                        # non-suffix component instead of one row for each.
                        show_output(component, page_number, row_number)
                        row_number += 1