주석 번역

rickiepark · Aug 1, 2021 · 7c16164 · 7c16164
1 parent 5e627bb
commit 7c16164
Show file tree

Hide file tree

Showing 20 changed files with 291 additions and 308 deletions.
diff --git a/ml_editor/data_ingestion.py b/ml_editor/data_ingestion.py
@@ -12,11 +12,11 @@
 
 def generate_model_text_features(raw_df_path, save_path=None):
     """
-    A function to generate features for model 2 and save them to disk.
-    These features take multiple minutes to compute
-    :param raw_df_path: path to raw DataFrame (generated from parse_xml_to_csv)
-    :param save_path: path to save processed DataFrame to
-    :return: processed DataFrame
+    모델 2를 위한 특성을 생성하고 디스크에 저장하는 함수
+    이 특성을 계산하는데 몇 분 정도 걸립니다.
+    :param raw_df_path: (parse_xml_to_csv에서 생성한) 원본 DataFrame 경로
+    :param save_path: 처리된 DataFrame을 저장할 경로
+    :return: 처리된 DataFrame
     """
     df = pd.read_csv(raw_df_path)
     df = format_raw_df(df.copy())
@@ -33,25 +33,25 @@ def generate_model_text_features(raw_df_path, save_path=None):
 
 def parse_xml_to_csv(path, save_path=None):
     """
-    Open .xml posts dump and convert the text to a csv, tokenizing it in the process
-    :param path: path to the xml document containing posts
-    :return: a dataframe of processed text
+    .xml 포스트 덤프를 열어 텍스트를 csv로 변환합니다.
+    :param path: 포스트가 담긴 xml 문서의 경로
+    :return: 처리된 텍스트의 데이터프레임
     """
 
-    # Use python's standard library to parse XML file
+    # 파이썬의 표준 라이브러리로 XML 파일을 파싱합니다.
     doc = ElT.parse(path)
     root = doc.getroot()
 
-    # Each row is a question
+    # 각 행은 하나의 질문입니다.
     all_rows = [row.attrib for row in root.findall("row")]
 
-    # Using tdqm to display progress since preprocessing takes time
+    # 전처리에 시간이 걸리므로 tqdm을 사용해 진행 상황을 표시합니다.
     for item in tqdm(all_rows):
-        # Decode text from HTML
+        # HTML에서 텍스트를 추출합니다.
         soup = BeautifulSoup(item["Body"], features="html.parser")
         item["body_text"] = soup.get_text()
 
-    # Create dataframe from our list of dictionaries
+    # 딕셔너리의 리스트에서 데이터프레임을 만듭니다.
     df = pd.DataFrame.from_dict(all_rows)
     if save_path:
         df.to_csv(save_path)
@@ -60,10 +60,10 @@ def parse_xml_to_csv(path, save_path=None):
 
 def get_data_from_dump(site_name, load_existing=True):
     """
-    load .xml dump, parse it to a csv, serialize it and return it
-    :param load_existing: should we load the existing extract or regenerate it
-    :param site_name: name of the stackexchange website
-    :return: pandas DataFrame of the parsed xml
+    .xml 덤프를 로드하고, 파싱하여 csv로 만들고, 직렬화한 다음 반환합니다.
+    :param load_existing: 기존에 추출한 csv를 로드할지 새로 생성할지 결정합니다.
+    :param site_name: 스택익스체인지 웹사이트 이름
+    :return: 파싱된 xml의 판다스 DataFrame
     """
     data_path = Path("data")
     dump_name = "%s.stackexchange.com/Posts.xml" % site_name

diff --git a/ml_editor/data_processing.py b/ml_editor/data_processing.py
@@ -6,11 +6,11 @@
 
 def format_raw_df(df):
     """
-    Cleanup data and join questions to answers
-    :param df: raw DataFrame
-    :return: processed DataFrame
+    데이터를 정제하고 질문과 대답을 합칩니다.
+    :param df: 원본 DataFrame
+    :return: 처리된 DataFrame
     """
-    # Fixing types and setting index
+    # 타입을 고치고 인덱스를 설정합니다.
     df["PostTypeId"] = df["PostTypeId"].astype(int)
     df["Id"] = df["Id"].astype(int)
     df["AnswerCount"] = df["AnswerCount"].fillna(-1)
@@ -21,10 +21,10 @@ def format_raw_df(df):
 
     df["is_question"] = df["PostTypeId"] == 1
 
-    # Filtering out PostTypeIds other than documented ones
+    # 문서화된 것 이외의 PostTypeId를 필터링합니다.
     df = df[df["PostTypeId"].isin([1, 2])]
 
-    # Linking questions and answers
+    # 질문과 대답을 연결합니다.
     df = df.join(
         df[["Id", "Title", "body_text", "Score", "AcceptedAnswerId"]],
         on="ParentId",
@@ -36,11 +36,10 @@ def format_raw_df(df):
 
 def train_vectorizer(df):
     """
-    Train a vectorizer for some data.
-    Returns the vectorizer to be used to transform non-training data, in
-    addition to the training vectors
-    :param df: data to use to train the vectorizer
-    :return: trained vectorizers and training vectors
+    벡터화 객체를 훈련합니다.
+    훈련 데이터와 그 외 데이터를 변환하는데 사용할 벡터화 객체를 반환합니다.
+    :param df: 벡터화 객체를 훈련하는데 사용할 데이터
+    :return: 훈련된 벡터화 객체
     """
     vectorizer = TfidfVectorizer(
         strip_accents="ascii", min_df=5, max_df=0.5, max_features=10000
@@ -52,10 +51,10 @@ def train_vectorizer(df):
 
 def get_vectorized_series(text_series, vectorizer):
     """
-    Vectorizes an input series using a pre-trained vectorizer
-    :param text_series: pandas Series of text
-    :param vectorizer: pretrained sklearn vectorizer
-    :return: array of vectorized features
+    사전 훈련된 벡터화 객체를 사용해 입력 시리즈를 벡터화합니다.
+    :param text_series: 텍스트의 판다스 시리즈
+    :param vectorizer: 사전 훈련된 sklearn의 벡터화 객체
+    :return: 벡터화된 특성 배열
     """
     vectors = vectorizer.transform(text_series)
     vectorized_series = [vectors[i] for i in range(vectors.shape[0])]
@@ -64,10 +63,9 @@ def get_vectorized_series(text_series, vectorizer):
 
 def add_text_features_to_df(df):
     """
-    Ads features to DataFrame
+    DataFrame에 특성을 추가합니다.
     :param df: DataFrame
-    :param pretrained_vectors: whether to use pretrained vectors for embeddings
-    :return: DataFrame with additional features
+    :return: 특성이 추가된 DataFrame
     """
     df["full_text"] = df["Title"].str.cat(df["body_text"], sep=" ", na_rep="")
     df = add_v1_features(df.copy())
@@ -77,9 +75,9 @@ def add_text_features_to_df(df):
 
 def add_v1_features(df):
     """
-    Add our first features to an input DataFrame
-    :param df: DataFrame of questions
-    :return: DataFrame with added feature columns
+    입력 DataFrame에 첫 번째 특성을 추가합니다.
+    :param df: 질문 DataFrame
+    :return: 특성이 추가된 DataFrame
     """
     df["action_verb_full"] = (
         df["full_text"].str.contains("can", regex=False)
@@ -98,9 +96,9 @@ def add_v1_features(df):
 
 def get_vectorized_inputs_and_label(df):
     """
-    Concatenate DataFrame features with text vectors
-    :param df: DataFrame with calculated features
-    :return: concatenated vector consisting of features and text
+    DataFrame 특성과 텍스트 벡터를 연결합니다.
+    :param df: 계산된 특성의 DataFrame
+    :return: 특성과 텍스트로 구성된 벡터
     """
     vectorized_features = np.append(
         np.vstack(df["vectors"]),
@@ -135,21 +133,21 @@ def get_feature_vector_and_label(df, feature_names):
 
 def get_normalized_series(df, col):
     """
-    Get a normalized version of a column
+    DataFrame 열을 정규화합니다.
     :param df: DataFrame
-    :param col: column name
-    :return: normalized series using Z-score
+    :param col: 열 이름
+    :return: Z-점수를 사용해 정규화된 시리즈 객체
     """
     return (df[col] - df[col].mean()) / df[col].std()
 
 
 def get_random_train_test_split(posts, test_size=0.3, random_state=40):
     """
-    Get train/test split from DataFrame
-    Assumes the DataFrame has one row per question example
-    :param posts: all posts, with their labels
-    :param test_size: the proportion to allocate to test
-    :param random_state: a random seed
+    DataFrame을 훈련/테스트 세트로 나눕니다.
+    DataFrame이 질문마다 하나의 행을 가진다고 가정합니다.
+    :param posts: 모든 포스트와 레이블
+    :param test_size: 테스트 세트로 할당할 비율
+    :param random_state: 랜덤 시드
     """
     return train_test_split(
         posts, test_size=test_size, random_state=random_state
@@ -160,12 +158,12 @@ def get_split_by_author(
     posts, author_id_column="OwnerUserId", test_size=0.3, random_state=40
 ):
     """
-    Get train/test split
-    Guarantee every author only appears in one of the splits
-    :param posts: all posts, with their labels
-    :param author_id_column: name of the column containing the author_id
-    :param test_size: the proportion to allocate to test
-    :param random_state: a random seed
+    훈련 세트와 테스트 세트로 나눕니다.
+    작성자가 두 세트 중에 하나에만 등장하는 것을 보장합니다.
+    :param posts: 모든 포스트와 레이블
+    :param author_id_column: author_id가 들어 있는 열 이름
+    :param test_size: 테스트 세트로 할당할 비율
+    :param random_state: 랜덤 시드
     """
     splitter = GroupShuffleSplit(
         n_splits=1, test_size=test_size, random_state=random_state

diff --git a/ml_editor/data_visualization.py b/ml_editor/data_visualization.py
@@ -4,9 +4,9 @@
 
 def plot_embeddings(embeddings, sent_labels):
     """
-    Plot embeddings, colored by sentence label
-    :param embeddings: two dimensional embeddings
-    :param sent_labels: labels to display
+    문장 레이블에 따라 색을 입힌 임베딩 그래프 그리기
+    :param embeddings: 2차원 임베딩
+    :param sent_labels: 출력할 레이블
     """
     fig = plt.figure(figsize=(16, 10))
     color_map = {True: "#1f77b4", False: "#ff7f0e"}

diff --git a/ml_editor/explanation_generation.py b/ml_editor/explanation_generation.py
@@ -78,9 +78,9 @@
 
 def get_explainer():
     """
-    Prepare LIME explainer using our training data. This is fast enough that
-    we do not bother with serializing it
-    :return: LIME explainer object
+    훈련 데이터를 사용해 LIME 설명 도구를 준비합니다.
+    직렬화하지 않아도 될만큼 충분히 빠릅니다.
+    :return: LIME 설명 도구 객체
     """
     curr_path = Path(os.path.dirname(__file__))
     data_path = Path("../data/writers_with_features.csv")
@@ -99,9 +99,9 @@ def get_explainer():
 
 def simplify_order_sign(order_sign):
     """
-    Simplify signs to make display clearer for users
-    :param order_sign: Input comparison operator
-    :return: Simplifier operator
+    사용자에게 명확한 출력을 위해 기호를 단순화합니다.
+    :param order_sign: 비교 연산자 입력
+    :return: 단순화된 연산자
     """
     if order_sign in ["<=", "<"]:
         return "<"
@@ -112,29 +112,29 @@ def simplify_order_sign(order_sign):
 
 def get_recommended_modification(simple_order, impact):
     """
-    Generate a recommendation string from an operator and the type of impact
-    :param simple_order: simplified operator
-    :param impact: whether the change has positive or negative impact
-    :return: formatted recommendation string
+    연산자와 영향 타입에 따라 추천 문장을 생성합니다.
+    :param simple_order: 단순화된 연산자
+    :param impact: 변화가 긍정적인지 부정적인지 여부
+    :return: 추천 문자열
     """
     bigger_than_threshold = simple_order == ">"
     has_positive_impact = impact > 0
 
     if bigger_than_threshold and has_positive_impact:
-        return "No need to decrease"
+        return "높일 필요가 없습니다"
     if not bigger_than_threshold and not has_positive_impact:
-        return "Increase"
+        return "높이세요"
     if bigger_than_threshold and not has_positive_impact:
-        return "Decrease"
+        return "낮추세요"
     if not bigger_than_threshold and has_positive_impact:
-        return "No need to increase"
+        return "낮출 필요가 없습니다"
 
 
 def parse_explanations(exp_list):
     """
-    Parse explanations returned by LIME into a user readable format
-    :param exp_list: explanations returned by LIME explainer
-    :return: array of dictionaries containing user facing strings
+    LIME이 반환한 설명을 사용자가 읽을 수 있도록 파싱합니다.
+    :param exp_list: LIME 설명 도구가 반환한 설명
+    :return: 사용자에게 보여줄 문자열을 담은 딕셔너리 배열
     """
     parsed_exps = []
     for feat_bound, impact in exp_list:
@@ -163,9 +163,9 @@ def parse_explanations(exp_list):
 
 def get_recommendation_string_from_parsed_exps(exp_list):
     """
-    Generate recommendation text we can display on a flask app
-    :param exp_list: array of dictionaries containing explanations
-    :return: HTML displayable recommendation text
+    플래스크 앱에서 출력할 수 있는 추천 텍스트를 생성합니다.
+    :param exp_list: 설명을 담은 딕셔너리의 배열
+    :return: HTML 추천 텍스트
     """
     recommendations = []
     for i, feature_exp in enumerate(exp_list):

diff --git a/ml_editor/inference.py b/ml_editor/inference.py
@@ -1,5 +1,5 @@
-"""inference.py: This module contains function stubs serving as book examples.
-The functions are not used for the ml_editor app or notebook
+"""inference.py: 이 모듈은 책의 예제로 사용되는 스텁(stub) 함수를 담고 있습니다.
+이 함수들은 ml_editor 앱이나 노트북에서 사용되지 않습니다.
 """
 
 from functools import lru_cache
@@ -43,45 +43,44 @@ def run_heuristic(question_len):
 @lru_cache(maxsize=128)
 def run_model(question_data):
     """
-    This is a stub function. We actually use the lru_cache with a purpose
-    in app.py
+    스텁 함수입니다. 실제로 app.py에서 lru_cache를 사용합니다.
     :param question_data:
     """
-    # Insert any slow model inference below
+    # 아래에 느린 모델 추론 코드를 추가하세요.
     pass
 
 
 def validate_and_handle_request(question_data):
     missing = find_absent_features(question_data)
     if len(missing) > 0:
-        raise ValueError("Missing feature(s) %s" % missing)
+        raise ValueError("누락된 특성: %s" % missing)
 
     wrong_types = check_feature_types(question_data)
     if len(wrong_types) > 0:
-        # If data is wrong but we have the length of the question, run heuristic
+        # 데이터가 잘못되었지만 질문의 길이가 있다면 경험 규칙을 실행합니다.
         if "text_len" in question_data.keys():
             if isinstance(question_data["text_len"], float):
                 return run_heuristic(question_data["text_len"])
-        raise ValueError("Incorrect type(s) %s" % wrong_types)
+        raise ValueError("잘못된 타입: %s" % wrong_types)
 
     return run_model(question_data)
 
 
 def verify_output_type_and_range(output):
     if not isinstance(output, float):
-        raise ValueError("Wrong output type %s, %s" % (output, type(output)))
+        raise ValueError("잘못된 출력 타입: %s, %s" % (output, type(output)))
     if not 0 < output < 1:
-        raise ValueError("Output out of range %s, %s" % output)
+        raise ValueError("범위 밖의 출력: %s" % output)
 
 
 def validate_and_correct_output(question_data, model_output):
-    # Verify type and range and raise errors accordingly
+    # 타입과 범위를 검사해 적절히 에러를 발생시킵니다.
     try:
-        # Raises value error if model output is incorrect
+        # 모델 출력이 잘못되면 에러를 발생시킵니다.
         verify_output_type_and_range(model_output)
     except ValueError:
-        # We run a heuristic, but could run a different model here
+        # 경험 규칙을 실행하지만 다른 모델을 실행할 수도 있습니다.
         run_heuristic(question_data["text_len"])
 
-    # If we did not raise an error, we return our model result
+    # 에러가 발생되지 않으면 모델 결과를 반환합니다.
     return model_output