29
loading...
This website collects cookies to deliver better user experience
<ruby><rb>辞</rb><rt>や</rt>めたい</ruby>
<ruby><rb>辿</rb><rt>たど</rt>り</ruby><ruby><rb>着</rb><rt>つ</rt>く</ruby>
pip install sudachipy sudachidict_small wanakana-python
from sudachipy import tokenizer
from sudachipy import dictionary
# Module-level Sudachi tokenizer created from the "small" dictionary; add_furigana
# uses this single instance for every call.
tokenizer_obj = dictionary.Dictionary(dict_type="small").create()
def add_furigana(text):
    """Tokenize ``text`` and build the furigana-annotated string.

    This is the skeleton of the function: it tokenizes ``text`` with
    ``SplitMode.C`` and returns the accumulated ``parsed`` string. The
    per-token processing is elided at the ``# ...`` marker.
    """
    tokens = list(tokenizer_obj.tokenize(text, tokenizer.Tokenizer.SplitMode.C))
    parsed = ''
    # ...
    return parsed
from wanakana import to_hiragana, is_japanese, is_katakana, is_hiragana
import string
# Characters that is_japanese_extended rejects in addition to string.punctuation.
# Despite the name, it also lists a space, the ASCII digits and the ASCII letters.
# NOTE(review): it is consulted with `text not in ...`, which is a substring test
# on the whole token rather than a per-character test.
JAPANESE_PUNCTUATION = ' 〜!?。、():「」『』0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
def is_japanese_extended(text):
    """Return a truthy value when ``text`` is Japanese and not an excluded string.

    ``text`` must pass wanakana's ``is_japanese`` and must not occur (as a
    substring) in either ``string.punctuation`` or ``JAPANESE_PUNCTUATION``.
    """
    japanese = is_japanese(text)
    if not japanese:
        # Hand back is_japanese's own falsy result, exactly as the `and` chain would.
        return japanese
    return not (text in string.punctuation or text in JAPANESE_PUNCTUATION)
# Trailing characters that add_furigana splits off a token: when a token's last
# character is in this string, the rest of the token is parsed on its own and the
# character is appended unchanged.
SPECIAL_CHARACTERS = '〜'
def add_furigana(text):
    """Tokenize ``text`` and build the furigana-annotated string.

    Tokens that are not Japanese (per ``is_japanese_extended``) or that are
    entirely katakana or entirely hiragana are copied through unchanged.
    A token ending in one of ``SPECIAL_CHARACTERS`` is parsed recursively
    without that last character, which is then appended as-is. For every
    other token the hiragana reading is computed; the code that merges the
    reading into ``parsed`` is elided at the ``# ...`` marker.
    """
    tokens = list(tokenizer_obj.tokenize(text, tokenizer.Tokenizer.SplitMode.C))
    parsed = ''
    for index, token in enumerate(tokens):
        surface = token.surface()
        to_parse = (
            is_japanese_extended(surface)
            and not is_katakana(surface)
            and not is_hiragana(surface)
        )

        if not to_parse:
            # Nothing to annotate: emit the token exactly as written.
            parsed += surface
            continue

        if surface[-1] in SPECIAL_CHARACTERS:
            # Parse everything before the special character, then re-attach it.
            parsed += add_furigana(surface[:-1]) + surface[-1]
            continue

        # Separate this annotated token from whatever precedes it.
        if index > 0:
            parsed += ' '
        reading = to_hiragana(token.reading_form())
        # ...
    return parsed
# Hand-written overrides: surface text -> ready-made "kanji[reading]" output.
# Keys may be a single token's surface or two adjacent surfaces joined together.
KANJI_READING_MAPPING = {
    # Pronouns and question words
    '私': '私[わたし]',
    '貴女': '貴女[あなた]',
    '何が': '何[なに]が',
    '何を': '何[なに]を',
    # Compound words
    '我国': '我国[わがくに]',
    '行き来': '行[い]き 来[き]',
    '外宇宙': '外宇宙[がいうちゅう]',
    '異星人': '異星人[いせいじん]',
    # Given names
    '優那': '優那[ゆうな]',
    '菜々美': '菜々美[ななみ]',
}
We can also use a JSON file for easier data management, but here we will keep it contained in the Python file.
We also check whether the current token's surface joined with the next token's surface, token.surface() + tokens[index+1].surface(), is a key in our mapping dictionary.
token_indexes_to_skip = []
# Excerpt of the token loop with the custom-mapping lookup added.
# "# ..." marks code elided from this excerpt.
for index, token in enumerate(tokens):
# A token already emitted as the second half of a two-token match is skipped.
if index in token_indexes_to_skip:
continue
# ...
# First try the current surface joined with the next token's surface as a key;
# the index bound keeps tokens[index+1] from being read past the last token.
if index < len(tokens)-1 and token.surface() + tokens[index+1].surface() in KANJI_READING_MAPPING:
parsed += KANJI_READING_MAPPING[tokens[index].surface() + tokens[index+1].surface()]
# The next token is now covered by this match, so mark it to be skipped.
token_indexes_to_skip.append(index+1)
# Otherwise fall back to a key made of the current surface alone.
elif token.surface() in KANJI_READING_MAPPING:
parsed += KANJI_READING_MAPPING[token.surface()]
else:
# ...
We add a list token_indexes_to_skip to store the index+1 tokens that are already used for the custom mapping. This way we can skip parsing it in the next iteration.
For example, the surface form 教える has the reading form オシエル and subsequently おしえる after Hiragana conversion.
By matching え in おしえる to 教える, we can parse the reading of 教 as おし. Here reading_index's value is 0 and reading_index_tail's value is 2, so reading[reading_index:reading_index_tail] is おし.
For example, 可愛い is parsed as 可愛[かわい]い instead of 可愛[かわ]いい. By checking for the repetition of the final い, we can force the first い to be included in the reading.
# Excerpt: walk one token's surface and its reading side by side, appending kana
# unchanged and each non-kana run together with the slice of the reading it covers.
# surface_index is the cursor into token.surface(); reading_index is the cursor
# into `reading`.
surface_index = 0
reading_index = 0
while len(token.surface()) > surface_index:
if is_hiragana(token.surface()[surface_index]) or is_katakana(token.surface()[surface_index]):
# Kana in the surface is copied as-is, and both cursors move one character.
parsed += token.surface()[surface_index]
reading_index += 1
surface_index += 1
else:
# Non-kana character: look for the next kana in the surface (-1 means none).
next_index = -1
for token_index in range(surface_index, len(token.surface())):
if is_hiragana(token.surface()[token_index]) or is_katakana(token.surface()[token_index]):
next_index = token_index
break
if next_index < 0:
# No kana follows: the rest of the surface takes the rest of the reading.
parsed += to_anki_format(
index=surface_index, kanji=token.surface()[surface_index:],
reading=reading[reading_index:])
break
else:
# Advance reading_index_tail to the reading character equal to the kana that
# follows this run in the surface. A match is passed over when the same
# character is immediately repeated in the reading, so the later one is used.
# NOTE(review): the first comparison indexes `reading` without a bounds check;
# if that kana never occurs in `reading` this looks like it raises IndexError.
# Confirm.
reading_index_tail = reading_index
while reading[reading_index_tail] != token.surface()[next_index] or (reading_index_tail < len(reading)-1 and reading[reading_index_tail] == reading[reading_index_tail+1]):
reading_index_tail += 1
parsed += to_anki_format(
index = surface_index,
kanji = token.surface()[surface_index:next_index],
reading = reading[reading_index:reading_index_tail])
reading_index = reading_index_tail
# Despite its name, reading_length is the number of surface characters in the
# run just emitted (next_index - surface_index).
reading_length = next_index - surface_index
if reading_length > 0:
surface_index += reading_length
else:
# No progress was made on the surface; stop instead of looping.
break
def to_anki_format(index: int, kanji: str, reading: str) -> str:
    """Format a kanji run and its reading as ``kanji[reading]``.

    Args:
        index: Position of ``kanji`` within its token's surface. Any value
            greater than 0 puts a single space in front of the result, so
            the annotated run is separated from the text before it.
        kanji: The characters the reading belongs to.
        reading: The reading to place inside the square brackets.

    Returns:
        ``kanji[reading]``, prefixed with a space when ``index > 0``. When
        ``reading`` is empty, ``kanji`` is returned unchanged.
    """
    if not reading:
        # An empty reading would give "kanji[]": brackets with no furigana in
        # them. Emit the bare characters instead.
        return kanji
    separator = ' ' if index > 0 else ''
    return f'{separator}{kanji}[{reading}]'
Note: to run this example, you may need to run pip install sudachidict_small
in the repl shell to install the Sudachi dictionary.