acompany-develop · mdonaka · Oct 10, 2023 · Sep 25, 2023 · Sep 25, 2023 · Sep 25, 2023
diff --git a/packages/client/libclient-py/README-ja.md b/packages/client/libclient-py/README-ja.md
@@ -39,3 +39,8 @@ $ pipenv run make fmt
 3.7.10
 
 テストは3.7,3.8,3.9でされているため，いずれも動作する．
+
+## 入力仕様
+`qpd.read_csv`で受け取るCSVデータではあらゆる数値，文字列を入力として受け付け，
+数値は64bit浮動小数，文字列は任意精度整数として扱う．
+そのため，64bit浮動小数で表現できない数値については精度が保証されない．特にマッチングで使用する列の場合は全く異なる値にparseされる可能性があるため，文字列として入力することを推奨する．
diff --git a/packages/client/libclient-py/README.md b/packages/client/libclient-py/README.md
@@ -34,3 +34,7 @@ $ pipenv run make fmt
 3.7.10
 
 Tests have been done with 3.7, 3.8, and 3.9, so they all work.
+
+## Input specifications
+The CSV data read by `qpd.read_csv` may contain any number or character string as input; numbers are treated as 64-bit floating-point numbers, and character strings are treated as arbitrary-precision integers.
+Therefore, the precision of numbers that cannot be represented as 64-bit floating-point numbers is not guaranteed. In particular, values in columns used for matching may be parsed into completely different values, so it is recommended to input them as strings.
diff --git a/packages/client/libclient-py/quickmpc/pandas/parser.py b/packages/client/libclient-py/quickmpc/pandas/parser.py
@@ -1,4 +1,3 @@
-import csv
 from dataclasses import dataclass
 from hashlib import sha512
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
@@ -66,20 +65,16 @@ def format_check(secrets: List[List[ShareValueType]],
     return True
 
 
-def to_float(val: str) -> float:
-    """ If val is a float, convert as is; if it is a string, hash it. """
-    try:
-        return float(val)
-    except ValueError:
-        # k,m are constants used in the comparison operation
-        # Due to the limitation of comparison operation,
-        # k bits are taken out and divided by 2^m.
-        k: int = 48
-        m: int = 20
-        hs: str = sha512(val.encode()).hexdigest()
-        val_int: int = int(hs[:(k >> 2)], 16)
-        val_float: float = val_int / pow(2, m)
-        return val_float
+def to_float_for_matching(val: Union[str, int]) -> float:
+    # k,m are constants used in the comparison operation
+    # Due to the limitation of comparison operation,
+    # k bits are taken out and divided by 2^m.
+    k: int = 48
+    m: int = 20
+    hs: str = sha512(str(val).encode()).hexdigest()
+    val_int: int = int(hs[:(k >> 2)], 16)
+    val_float: float = val_int / pow(2, m)
+    return val_float
 
 
 def to_int(val: str, encoding='utf-8') -> int:
@@ -133,11 +128,11 @@ def find_types(schema: List[str],
 def convert(element: str,
             type_info: ShareValueTypeEnum.ValueType) -> ShareValueType:
     if type_info == ShareValueTypeEnum.Value('SHARE_VALUE_TYPE_FIXED_POINT'):
-        return to_float(element)
+        return float(element)
     if type_info == ShareValueTypeEnum.Value(
             'SHARE_VALUE_TYPE_UTF_8_INTEGER_REPRESENTATION'):
         return to_int(element)
-    return to_float(element)
+    return float(element)
 
 
 def parse(data: List[List[str]], matching_column: Optional[int] = None) \
@@ -146,7 +141,6 @@ def parse(data: List[List[str]], matching_column: Optional[int] = None) \
     types = find_types(schema_name, data[1:], matching_column)
     schema = [Schema(name=name, type=type)
               for name, type in zip(schema_name, types)]
-
     # check size first because an iterator which `zip` bultin function returns
     # stops when the shortest iterable is exhausted
     if not FormatChecker.check_size(data[1:], schema):
@@ -160,12 +154,3 @@ def parse(data: List[List[str]], matching_column: Optional[int] = None) \
         raise RuntimeError("規定されたフォーマットでないデータです．")
 
     return secrets, schema
-
-
-def parse_csv(
-    filename: str, matching_column: Optional[int] = None) \
-        -> Tuple[List[List[ShareValueType]], List[Schema]]:
-    with open(filename) as f:
-        reader = csv.reader(f)
-        text: List[List[str]] = [row for row in reader]
-        return parse(text, matching_column)
diff --git a/packages/client/libclient-py/quickmpc/pandas/readers.py b/packages/client/libclient-py/quickmpc/pandas/readers.py
@@ -1,6 +1,6 @@
 import pandas as pd
 
-from quickmpc.pandas.parser import to_float
+from quickmpc.pandas.parser import to_float_for_matching
 
 
 def read_csv(*args, index_col: str, **kwargs) -> pd.DataFrame:
@@ -23,7 +23,7 @@ def read_csv(*args, index_col: str, **kwargs) -> pd.DataFrame:
     """
     df = pd.read_csv(*args, **kwargs)
     # ID列を数値化
-    df[index_col] = df[index_col].map(lambda x: to_float(x))
+    df[index_col] = df[index_col].map(lambda x: to_float_for_matching(x))
     # send_share時にID列でsortできる様にID列を座標圧縮した列を追加する
     df["__qmpc_sort_index__"] = df.index
     df = df.sort_values(by=index_col)

diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/bitvector.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/bitvector.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/diff_col.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/diff_col.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/edge_data.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/edge_data.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/empty.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/empty.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/none.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/none.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/normal.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/normal.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/not_csv.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/not_csv.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/over_number.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/over_number.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/string_data.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/string_data.csv
diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py b/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py
@@ -1,11 +1,8 @@
 import math
-import os
-from typing import List
 
-import numpy as np
 import pytest
 
-from quickmpc.pandas.parser import parse, parse_csv
+from quickmpc.pandas.parser import parse, to_float_for_matching
 from quickmpc.proto.common_types.common_types_pb2 import (Schema,
                                                           ShareValueTypeEnum)
 
@@ -21,96 +18,28 @@ def schema_int(name: str):
                   .SHARE_VALUE_TYPE_UTF_8_INTEGER_REPRESENTATION)
 
 
-# 元データ
-normal_data: List[List[str]] = [s.split(",") for s in [
-    "id,attr1,attr2,attr3,attr4,attr5,attr6",
-    "hoge,0,0.77,0.63,0.35,0.39,0.35",
-    "huga,0,0.37,0.36,0.43,0.41,0.39",
-    "piyo,1,0.34,0.34,0.44,0.50,0.32",
-    "moge,1,0.47,0.43,0.34,0.29,0.34",
-    "moga,0,0.67,0.41,0.25,0.49,0.25",
-]]
-data3: List[List[str]] = [s.split(",") for s in [
-    "id,id:id",
-    "hoge,hoge",
-    "huga,huga",
-    "moge,moge",
-    "moga,moga",
-]]
-
-# 正しくparseされたデータ
-d1_schema_str: List[str] = ['id', 'attr1', 'attr2',
-                            'attr3', 'attr4', 'attr5', 'attr6']
-d1_schema: List[Schema] = [schema_fp(name) for name in d1_schema_str]
-d1_secrets: List[List[float]] = [
-    [230379555.4797964, 0, 0.77, 0.63, 0.35, 0.39, 0.35],
-    [10723675.973257065, 0, 0.37, 0.36, 0.43, 0.41, 0.39],
-    [117576607.23670769, 1, 0.34, 0.34, 0.44, 0.5, 0.32],
-    [211114761.8482437, 1, 0.47, 0.43, 0.34, 0.29, 0.34],
-    [13292676.303739548, 0, 0.67, 0.41, 0.25, 0.49, 0.25]
-]
-
-d2_schema_str: List[str] = ['id#0', 'attr1#0', 'attr1#1',
-                            'attr2#0', 'attr2#1', 'attr2#2',
-                            'attr3#0', 'attr3#1', 'attr3#2', 'attr3#3']
-d2_schema: List[Schema] = [schema_fp(name) for name in d2_schema_str]
-d2_secrets: List[List[float]] = [
-    [230379555.4797964, 1, 0, 1, 0, 0, 1, 0, 0, 0],
-    [10723675.973257065, 1, 0, 0, 1, 0, 0, 1, 0, 0],
-    [211114761.8482437, 0, 1, 0, 0, 1, 0, 0, 1, 0],
-    [13292676.303739548, 1, 0, 1, 0, 0, 0, 0, 0, 1]
-]
-
-d3_schema: List[Schema] = [schema_int('id'), schema_fp('id:id')]
-d3_secrets: List[List[float]] = [
-    [1752131429, 230379555.4797964],
-    [1752524641, 10723675.973257065],
-    [1836017509, 211114761.8482437],
-    [1836017505, 13292676.303739548]
-]
-
-
-def test_parse():
-    """ 正しくパースできるかTest """
-    secrets, schema = parse(normal_data, matching_column=1)
-    assert (np.allclose(secrets, d1_secrets))
-    assert (schema == d1_schema)
-
-
-def test_parse_str():
-    secrets, schema = parse(data3)
-    assert (np.allclose(secrets, d3_secrets))
-    assert (schema == d3_schema)
-
-
-def test_parse_errorhandring():
-    """ 異常値を与えてエラーが出るかTest """
-    with pytest.raises(Exception):
-        # 行が足りずシェアがない
-        parse([["id", "a", "b", "c"]])
-    with pytest.raises(Exception):
-        # schemaに同じものが含まれる
-        parse([["id", "a", "a"],
-               ["id1", "1", "2"],
-               ["id2", "3", "4"]])
-    with pytest.raises(Exception):
-        # 正方行列でない
-        parse([["id", "a", "b"],
-               ["id1", "1", "2"],
-               ["id2", "3", "4", "5"]])
-        parse([["id", "a", "b"],
-               ["id1", "1", "2"],
-               ["id2"]])
-
-
 @pytest.mark.parametrize(
-    ("csv_file", "expected_secrets", "expected_schema"),
+    ("data", "expected_secrets", "expected_schema"),
     [
-        # 動作確認
-        ("normal.csv", d1_secrets, d1_schema),
-
-        # エッジケース
-        ("edge_data.csv",
+        # 通常case
+        ([["id", "a", "b"], ["1.0", "0", "0.77"], ["2.0", "0", "0.37"]],
+            [[1.0, 0, 0.77], [2.0, 0, 0.37]],
+            [schema_fp(name) for name in ["id", "a", "b"]]),
+        # tag付き
+        ([["id", "id:id"], ["str", "2.0"]],
+            [[7566450, 2.0]],
+         [schema_int('id'), schema_fp('id:id')]),
+
+        # edge case
+        ([["id", "zero", "int_max", "int_min",
+           "float_min_plus", "float_max_minus", "string_max"],
+          ["0", "0",
+           "10000000000", "-10000000000", "0.00000000001", "-0.00000000001",
+           "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"
+           "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"
+           "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"
+           "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"
+           "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"]],
          [[0.0, 0.0, 10000000000.0, -10000000000.0, 1e-11, -1e-11,
            int("81294350169683468997949680580862592771577912922072"
                "36632400050104509615137476244113137539228236962890"
@@ -140,7 +69,20 @@ def test_parse_errorhandring():
           schema_fp('float_max_minus'), schema_int('string_max'), ]),
 
         # 文字列
-        ("string_data.csv",
+        ([["id", "alphabet", "hiragana", "katakana",
+           "chinese_characters", "large_number", "emoji"],
+            ["0", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
+             "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそ"
+             "ぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺ"
+             "ほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ",
+             "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソ"
+             "ゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペ"
+             "ホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺ",
+             "春眠不覚暁処処聞啼鳥夜来風雨声花落知多少",
+             "１２３４５６７８９０",
+             "😀😁😂🤣😃😄😅😆😉😊😋😎😍😘😗😙😚☺🙂🤗🤩🤔🤨😐😑😶🙄😏😣😥"
+             "😮🤐😯😪😫😴😌😛😜😝🤤😒😓😔😕🙃🤑😲☹🙁😖😞😟😤😢😭😦😧😨😩"
+             "🤯😬😰😱😳🤪😵😡😠🤬😷🤒🤕🤢🤮🤧😇🤠🤡🤥🤫🤭🧐🤓"]],
          [[0.0,
            int("64376492020959182960102910068921920137578270955447"
                "00994229324997296606723909755334743502970718979797"
@@ -197,14 +139,11 @@ def test_parse_errorhandring():
              schema_int('katakana'), schema_int('chinese_characters'),
              schema_fp('large_number'),  # TODO: 文字列として解釈してほしい
              schema_int('emoji'),
-         ]),
+        ]),
     ]
 )
-def test_parse_csv(csv_file, expected_secrets, expected_schema):
-    """ csvを正しくパースできるかTest """
-    secrets, schema = parse_csv(
-        f"{os.path.dirname(__file__)}/test_files/{csv_file}",
-        matching_column=1)
+def test_parse(data, expected_secrets, expected_schema):
+    secrets, schema = parse(data)
     for row, row_expected in zip(secrets, expected_secrets):
         for x, y in zip(row, row_expected):
             if type(x) == int:
@@ -215,25 +154,52 @@ def test_parse_csv(csv_file, expected_secrets, expected_schema):
 
 
 @pytest.mark.parametrize(
-    ("csv_file", "expected_exception"),
+    ("data", "expected_exception"),
     [
-        # ファイルが存在しない
-        ("hoge", Exception),
-
-        # 列数が異なる
-        ("diff_col.csv", Exception),
+        # 行が足りずシェアがない
+        ([["id", "a", "b", "c"]], RuntimeError),
+        # schemaに同じものが含まれる
+        ([["id", "a", "a"],
+          ["id1", "1", "2"],
+          ["id2", "3", "4"]], RuntimeError),
+        # 正方行列でない
+        ([["id", "a", "b"],
+          ["id1", "1", "2"],
+          ["id2", "3", "4", "5"]], RuntimeError),
+        ([["id", "a", "b"],
+          ["id1", "1", "2"],
+          ["id2"]], RuntimeError),
+        # tableが空
+        ([["id"]], RuntimeError),
+        # csv形式でない
+        ({"name": "I am json"}, KeyError),
+    ]
 
-        # テーブルが空
-        ("empty.csv", Exception),
+)
+def test_parse_errorhandring(data, expected_exception):
+    """ 異常値を与えてエラーが出るかTest """
+    with pytest.raises(expected_exception):
+        parse(data)
 
-        # 空のデータが存在する
-        ("none.csv", Exception),
 
-        # csv形式じゃない
-        ("not_csv.csv", Exception),
+@pytest.mark.parametrize(
+    ("val", "expected"),
+    [
+        # 文字列
+        ("id1", 125382372.3739109),
+        ("hogehuga@gmail.com", 86705962.83638954),
+        ("very_very_very_very_very_very_very_very_very_long_string",
+         190655731.5899248),
+        ("日本語の文字列デスヨ", 40972936.22852039),
+        ("❗️✨🤟😁👍感謝❗️🙌✨感謝❗️🙌✨❗️🍖😋🍴✨", 62936327.66452408),
+        ("", 217595411.34348965),
+        # 数値
+        (0, 52152834.036356926),
+        (1, 81786090.20335388),
+        (1000000000000000000000, 35709723.87166405),
+        (1.1, 199269155.31771374),
+        (1000000000000000000000.11111111111111111111, 19815069.039226532),
     ]
 )
-def test_parse_csv_errorhandring(csv_file, expected_exception):
-    """ 異常値を与えてエラーが出るかTest """
-    with pytest.raises(expected_exception):
-        parse_csv(f"{os.path.dirname(__file__)}/test_files/{csv_file}")
+def test_to_float_for_maching(val, expected):
+    assert math.isclose(to_float_for_matching(val), expected)