Skip to content

Commit

Permalink
include meanings
Browse files Browse the repository at this point in the history
  • Loading branch information
odjhey committed Nov 24, 2024
1 parent 70e11a9 commit 16b93b6
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 24 deletions.
2 changes: 2 additions & 0 deletions explore/.flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[flake8]
max-line-length = 128
6 changes: 4 additions & 2 deletions explore/explore/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,7 @@ def run():
return main.run()


def process_words(df, column_name):
return main.process_words_from_dataframe(df, column_name)
def process_words(df):
return main.process_words_from_dataframe(
df,
)
31 changes: 23 additions & 8 deletions explore/explore/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,36 @@ def run():
return flattened_data


def process_words_from_dataframe(df, column_name):
def process_words_from_dataframe(df):
"""
Processes the words in a DataFrame to count kanji occurrences
and map them to their originating metadata.
Args:
df (pd.DataFrame): The input DataFrame with columns like 'Word', 'id', 'Transliteration', 'Meaning'.
Returns:
kanji_counter (Counter): Counts of kanji occurrences.
kanji_tree (dict): Mapping of each kanji to a list of row dicts (one entry per kanji occurrence) taken from the original DataFrame.
"""
# Define regex pattern to match only kanji
kanji_pattern = re.compile(r"\p{Script=Han}")

kanji_counter = Counter() # To count kanji occurrences
kanji_tree = {} # To map kanji to their originating words
kanji_tree = {} # To map kanji to their originating rows

# Iterate through the column
for word in df[column_name].dropna():
# Remove kana and other non-kanji characters
kanji_only = "".join(kanji_pattern.findall(word))
# Iterate through rows in the DataFrame
for _, row in df.dropna(subset=["Word"]).iterrows():
word = row["Word"]
kanji_only = "".join(kanji_pattern.findall(word)) # Extract kanji

# Count kanji
for kanji in kanji_only:
# Increment kanji count
kanji_counter[kanji] += 1
kanji_tree.setdefault(kanji, set()).add(word)

# Add metadata to the kanji_tree
if kanji not in kanji_tree:
kanji_tree[kanji] = []
kanji_tree[kanji].append(row.to_dict()) # Store entire row as metadata

return kanji_counter, kanji_tree
81 changes: 71 additions & 10 deletions explore/notebooks/explore.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion harvest-data/.flake8
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
[flake8]
max-line-length = 100
max-line-length = 128
26 changes: 23 additions & 3 deletions harvest-data/harvest_data/upload_to_anki.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
def get_kanji_map():
kanji_map = pickle.load(open("inputs/kanji_map.pkl", "rb"))

print(list(kanji_map))
# convert all to list
return list(kanji_map.values())

Expand All @@ -21,15 +20,36 @@ def get_kanji_map():
# return [first_item]


def upload_as_anki_note(kanji, canonical, words):
def upload_as_anki_note(kanji, canonical, wordsMeta):
"""Add a kanji note to Anki."""

# wordsMeta has the format
# [{'Word': '起こす', 'Transliteration': 'おこす', 'Meaning': 'wake (someone) up', 'PoS': 'Verb'},
# {'Word': '起きる', 'Transliteration': 'おきる', 'Meaning': 'occur, happen', 'PoS': 'Verb'},
# {'Word': '起きる', 'Transliteration': 'おきる', 'Meaning': 'get up, get out of bed', 'PoS': 'Verb'},
# {'Word': '起こる', 'Transliteration': 'おこる', 'Meaning': 'happen', 'PoS': 'Verb'},
# {'Word': '早起き', 'Transliteration': 'はやおき', 'Meaning': 'getting up early', 'PoS': 'Verbal Noun'}]

words = [word["Word"] for word in wordsMeta]
words_string = "<br/>".join(words)

words_with_transliteration = [
f"{ word['Word'] } ({word['Transliteration']}) - {word['Meaning']}"
for word in wordsMeta
]

words_with_transliteration_string = "<br/>".join(words_with_transliteration)

# Construct the note
note = {
"deckName": DECK_NAME,
"modelName": MODEL_NAME,
"fields": {"Kanji": kanji, "Words": words_string, "CanonicalId": canonical},
"fields": {
"Kanji": kanji,
"Words": words_string,
"CanonicalId": canonical,
"WordsWithMeaning": words_with_transliteration_string,
},
}

# Send request to AnkiConnect
Expand Down

0 comments on commit 16b93b6

Please sign in to comment.