include meanings

odjhey · Nov 24, 2024 · 16b93b6 · 16b93b6
1 parent 70e11a9
commit 16b93b6
Show file tree

Hide file tree

Showing 6 changed files with 124 additions and 24 deletions.
diff --git a/explore/.flake8 b/explore/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 128
diff --git a/explore/explore/__init__.py b/explore/explore/__init__.py
@@ -5,5 +5,7 @@ def run():
     return main.run()
 
 
-def process_words(df, column_name):
-    return main.process_words_from_dataframe(df, column_name)
+def process_words(df):
+    return main.process_words_from_dataframe(
+        df,
+    )
diff --git a/explore/explore/main.py b/explore/explore/main.py
@@ -15,21 +15,36 @@ def run():
     return flattened_data
 
 
-def process_words_from_dataframe(df, column_name):
+def process_words_from_dataframe(df):
+    """
+    Processes the words in a DataFrame to count kanji occurrences
+    and map them to their originating metadata.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame with columns like 'Word', 'id', 'Transliteration', 'Meaning'.
+
+    Returns:
+        kanji_counter (Counter): Counts of kanji occurrences.
+        kanji_tree (dict): Mapping of each kanji to a list of row dicts (one per occurrence) from the original DataFrame.
+    """
     # Define regex pattern to match only kanji
     kanji_pattern = re.compile(r"\p{Script=Han}")
 
     kanji_counter = Counter()  # To count kanji occurrences
-    kanji_tree = {}  # To map kanji to their originating words
+    kanji_tree = {}  # To map kanji to their originating rows
 
-    # Iterate through the column
-    for word in df[column_name].dropna():
-        # Remove kana and other non-kanji characters
-        kanji_only = "".join(kanji_pattern.findall(word))
+    # Iterate through rows in the DataFrame
+    for _, row in df.dropna(subset=["Word"]).iterrows():
+        word = row["Word"]
+        kanji_only = "".join(kanji_pattern.findall(word))  # Extract kanji
 
-        # Count kanji
         for kanji in kanji_only:
+            # Increment kanji count
             kanji_counter[kanji] += 1
-            kanji_tree.setdefault(kanji, set()).add(word)
+
+            # Add metadata to the kanji_tree
+            if kanji not in kanji_tree:
+                kanji_tree[kanji] = []
+            kanji_tree[kanji].append(row.to_dict())  # Store entire row as metadata
 
     return kanji_counter, kanji_tree
diff --git a/explore/notebooks/explore.ipynb b/explore/notebooks/explore.ipynb
diff --git a/harvest-data/.flake8 b/harvest-data/.flake8
@@ -1,2 +1,2 @@
 [flake8]
-max-line-length = 100
+max-line-length = 128
diff --git a/harvest-data/harvest_data/upload_to_anki.py b/harvest-data/harvest_data/upload_to_anki.py
@@ -10,7 +10,6 @@
 def get_kanji_map():
     kanji_map = pickle.load(open("inputs/kanji_map.pkl", "rb"))
 
-    print(list(kanji_map))
     # convert all to list
     return list(kanji_map.values())
 
@@ -21,15 +20,36 @@ def get_kanji_map():
     # return [first_item]
 
 
-def upload_as_anki_note(kanji, canonical, words):
+def upload_as_anki_note(kanji, canonical, wordsMeta):
     """Add a kanji note to Anki."""
+
+    # wordsMeta is a list of row dicts with the format
+    # [{'Word': '起こす', 'Transliteration': 'おこす', 'Meaning': 'wake (someone) up', 'PoS': 'Verb'},
+    #  {'Word': '起きる', 'Transliteration': 'おきる', 'Meaning': 'occur, happen', 'PoS': 'Verb'},
+    #  {'Word': '起きる', 'Transliteration': 'おきる', 'Meaning': 'get up, get out of bed', 'PoS': 'Verb'},
+    #  {'Word': '起こる', 'Transliteration': 'おこる', 'Meaning': 'happen', 'PoS': 'Verb'},
+    #  {'Word': '早起き', 'Transliteration': 'はやおき', 'Meaning': 'getting up early', 'PoS': 'Verbal Noun'}]
+
+    words = [word["Word"] for word in wordsMeta]
     words_string = "<br/>".join(words)
 
+    words_with_transliteration = [
+        f"{ word['Word'] } ({word['Transliteration']}) - {word['Meaning']}"
+        for word in wordsMeta
+    ]
+
+    words_with_transliteration_string = "<br/>".join(words_with_transliteration)
+
     # Construct the note
     note = {
         "deckName": DECK_NAME,
         "modelName": MODEL_NAME,
-        "fields": {"Kanji": kanji, "Words": words_string, "CanonicalId": canonical},
+        "fields": {
+            "Kanji": kanji,
+            "Words": words_string,
+            "CanonicalId": canonical,
+            "WordsWithMeaning": words_with_transliteration_string,
+        },
     }
 
     # Send request to AnkiConnect