From 24941b32395df6365a52a6a18b15ca21ff75f548 Mon Sep 17 00:00:00 2001 From: Nakata Date: Mon, 25 Sep 2023 15:20:31 +0900 Subject: [PATCH 1/7] Add to_float for matching --- .../libclient-py/quickmpc/pandas/parser.py | 12 ++++++++++ .../tests/unit_tests/pandas/test_parser.py | 24 ++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/packages/client/libclient-py/quickmpc/pandas/parser.py b/packages/client/libclient-py/quickmpc/pandas/parser.py index dad49fd6e..0f32d5bdd 100644 --- a/packages/client/libclient-py/quickmpc/pandas/parser.py +++ b/packages/client/libclient-py/quickmpc/pandas/parser.py @@ -66,6 +66,18 @@ def format_check(secrets: List[List[ShareValueType]], return True +def to_float_for_matching(val: Union[str, int]) -> float: + # k,m are constants used in the comparison operation + # Due to the limitation of comparison operation, + # k bits are taken out and divided by 2^m. + k: int = 48 + m: int = 20 + hs: str = sha512(str(val).encode()).hexdigest() + val_int: int = int(hs[:(k >> 2)], 16) + val_float: float = val_int / pow(2, m) + return val_float + + def to_float(val: str) -> float: """ If val is a float, convert as is; if it is a string, hash it. 
""" try: diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py b/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py index 69f66716a..dfe922f07 100644 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py +++ b/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from quickmpc.pandas.parser import parse, parse_csv +from quickmpc.pandas.parser import parse, parse_csv, to_float_for_matching from quickmpc.proto.common_types.common_types_pb2 import (Schema, ShareValueTypeEnum) @@ -237,3 +237,25 @@ def test_parse_csv_errorhandring(csv_file, expected_exception): """ 異常値を与えてエラーが出るかTest """ with pytest.raises(expected_exception): parse_csv(f"{os.path.dirname(__file__)}/test_files/{csv_file}") + + +@pytest.mark.parametrize( + ("val", "expected"), + [ + # 文字列 + ("id1", 125382372.3739109), + ("hogehuga@gmail.com", 86705962.83638954), + ("very_very_very_very_very_very_very_very_very_long_string", 190655731.5899248), + ("日本語の文字列デスヨ", 40972936.22852039), + ("❗️✨🤟😁👍感謝❗️🙌✨感謝❗️🙌✨❗️🍖😋🍴✨", 62936327.66452408), + ("", 217595411.34348965), + # 数値 + (0, 52152834.036356926), + (1, 81786090.20335388), + (1000000000000000000000, 35709723.87166405), + (1.1, 199269155.31771374), + (1000000000000000000000.11111111111111111111, 19815069.039226532), + ] +) +def test_to_float_for_maching(val, expected): + assert math.isclose(to_float_for_matching(val), expected) From b9b55c30d51c4f663a743b8ffa0aa6d725547e01 Mon Sep 17 00:00:00 2001 From: Nakata Date: Mon, 25 Sep 2023 15:28:56 +0900 Subject: [PATCH 2/7] Update read_csv --- .../libclient-py/quickmpc/pandas/readers.py | 4 ++-- .../tests/unit_tests/pandas/test_reader.py | 21 ++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/packages/client/libclient-py/quickmpc/pandas/readers.py b/packages/client/libclient-py/quickmpc/pandas/readers.py index 07f408081..6a2b91908 100644 --- 
a/packages/client/libclient-py/quickmpc/pandas/readers.py +++ b/packages/client/libclient-py/quickmpc/pandas/readers.py @@ -1,6 +1,6 @@ import pandas as pd -from quickmpc.pandas.parser import to_float +from quickmpc.pandas.parser import to_float_for_matching def read_csv(*args, index_col: str, **kwargs) -> pd.DataFrame: @@ -23,7 +23,7 @@ def read_csv(*args, index_col: str, **kwargs) -> pd.DataFrame: """ df = pd.read_csv(*args, **kwargs) # ID列を数値化 - df[index_col] = df[index_col].map(lambda x: to_float(x)) + df[index_col] = df[index_col].map(lambda x: to_float_for_matching(x)) # send_share時にID列でsortできる様にID列を座標圧縮した列を追加する df["__qmpc_sort_index__"] = df.index df = df.sort_values(by=index_col) diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_reader.py b/packages/client/libclient-py/tests/unit_tests/pandas/test_reader.py index 02d934614..7ad830e0f 100644 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_reader.py +++ b/packages/client/libclient-py/tests/unit_tests/pandas/test_reader.py @@ -15,20 +15,27 @@ def to_string_io(data: List[List]) -> io.StringIO: @pytest.mark.parametrize( ("data", "index_col", "expected"), [ # IDの順序がそのまま - ([["id", "c"], ["a", 1], ["b", 2]], - "id", + ([["id", "c"], ["a", 1], ["b", 2]], "id", pd.DataFrame([[32772040.0, 1, 0], [86407020.0, 2, 1]], columns=["id", "c", "__qmpc_sort_index__"])), # IDの順序が逆 - ([["id", "c"], ["b", 2], ["a", 1]], - "id", + ([["id", "c"], ["b", 2], ["a", 1]], "id", pd.DataFrame([[86407020.0, 2, 1], [32772040.0, 1, 0]], columns=["id", "c", "__qmpc_sort_index__"])), # 1列目以外をID列に指定した場合 - ([["id", "c"], ["a", 1], ["b", 2]], - "c", - pd.DataFrame([["a", 1.0, 0], ["b", 2.0, 1]], + ([["id", "c"], ["a", 1], ["b", 2]], "c", + pd.DataFrame([["a", 81786090.20335388, 1], ["b", 67839041.07183933, 0]], columns=["id", "c", "__qmpc_sort_index__"])), + # ID列が巨大な整数 + ([["id"], [1000000000000000000000000]], "id", + pd.DataFrame([[250298887.90448284, 0]], columns=["id", "__qmpc_sort_index__"])), + 
([["id"], [-1000000000000000000000000]], "id", + pd.DataFrame([[146263071.2934265, 0]], columns=["id", "__qmpc_sort_index__"])), + # ID列が巨大な実数 + ([["id"], [1000000000000000000000000.1111111111111111]], "id", + pd.DataFrame([[108235191.58269978, 0]], columns=["id", "__qmpc_sort_index__"])), + ([["id"], [-1000000000000000000000000.1111111111111111]], "id", + pd.DataFrame([[264110776.18418598, 0]], columns=["id", "__qmpc_sort_index__"])), ] ) def test_read_csv(data, index_col, expected, From 1ff9c38656fdd729109c1c7f70e183badc2af1ac Mon Sep 17 00:00:00 2001 From: Nakata Date: Mon, 25 Sep 2023 17:58:58 +0900 Subject: [PATCH 3/7] Remove parse_csv --- .../libclient-py/quickmpc/pandas/parser.py | 30 +--- .../pandas/test_files/bitvector.csv | 5 - .../unit_tests/pandas/test_files/diff_col.csv | 6 - .../pandas/test_files/edge_data.csv | 2 - .../unit_tests/pandas/test_files/empty.csv | 1 - .../unit_tests/pandas/test_files/none.csv | 3 - .../unit_tests/pandas/test_files/normal.csv | 6 - .../unit_tests/pandas/test_files/not_csv.csv | 4 - .../pandas/test_files/over_number.csv | 2 - .../pandas/test_files/string_data.csv | 2 - .../tests/unit_tests/pandas/test_parser.py | 154 +++++------------- 11 files changed, 43 insertions(+), 172 deletions(-) delete mode 100644 packages/client/libclient-py/tests/unit_tests/pandas/test_files/bitvector.csv delete mode 100644 packages/client/libclient-py/tests/unit_tests/pandas/test_files/diff_col.csv delete mode 100644 packages/client/libclient-py/tests/unit_tests/pandas/test_files/edge_data.csv delete mode 100644 packages/client/libclient-py/tests/unit_tests/pandas/test_files/empty.csv delete mode 100644 packages/client/libclient-py/tests/unit_tests/pandas/test_files/none.csv delete mode 100644 packages/client/libclient-py/tests/unit_tests/pandas/test_files/normal.csv delete mode 100644 packages/client/libclient-py/tests/unit_tests/pandas/test_files/not_csv.csv delete mode 100644 
packages/client/libclient-py/tests/unit_tests/pandas/test_files/over_number.csv delete mode 100644 packages/client/libclient-py/tests/unit_tests/pandas/test_files/string_data.csv diff --git a/packages/client/libclient-py/quickmpc/pandas/parser.py b/packages/client/libclient-py/quickmpc/pandas/parser.py index 0f32d5bdd..02399688d 100644 --- a/packages/client/libclient-py/quickmpc/pandas/parser.py +++ b/packages/client/libclient-py/quickmpc/pandas/parser.py @@ -78,22 +78,6 @@ def to_float_for_matching(val: Union[str, int]) -> float: return val_float -def to_float(val: str) -> float: - """ If val is a float, convert as is; if it is a string, hash it. """ - try: - return float(val) - except ValueError: - # k,m are constants used in the comparison operation - # Due to the limitation of comparison operation, - # k bits are taken out and divided by 2^m. - k: int = 48 - m: int = 20 - hs: str = sha512(val.encode()).hexdigest() - val_int: int = int(hs[:(k >> 2)], 16) - val_float: float = val_int / pow(2, m) - return val_float - - def to_int(val: str, encoding='utf-8') -> int: encoded = val.encode(encoding) return int.from_bytes(encoded, byteorder='big') @@ -145,11 +129,11 @@ def find_types(schema: List[str], def convert(element: str, type_info: ShareValueTypeEnum.ValueType) -> ShareValueType: if type_info == ShareValueTypeEnum.Value('SHARE_VALUE_TYPE_FIXED_POINT'): - return to_float(element) + return float(element) if type_info == ShareValueTypeEnum.Value( 'SHARE_VALUE_TYPE_UTF_8_INTEGER_REPRESENTATION'): return to_int(element) - return to_float(element) + return float(element) def parse(data: List[List[str]], matching_column: Optional[int] = None) \ @@ -158,7 +142,6 @@ def parse(data: List[List[str]], matching_column: Optional[int] = None) \ types = find_types(schema_name, data[1:], matching_column) schema = [Schema(name=name, type=type) for name, type in zip(schema_name, types)] - # check size first because an iterator which `zip` bultin function returns # stops when the 
shortest iterable is exhausted if not FormatChecker.check_size(data[1:], schema): @@ -172,12 +155,3 @@ def parse(data: List[List[str]], matching_column: Optional[int] = None) \ raise RuntimeError("規定されたフォーマットでないデータです.") return secrets, schema - - -def parse_csv( - filename: str, matching_column: Optional[int] = None) \ - -> Tuple[List[List[ShareValueType]], List[Schema]]: - with open(filename) as f: - reader = csv.reader(f) - text: List[List[str]] = [row for row in reader] - return parse(text, matching_column) diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/bitvector.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/bitvector.csv deleted file mode 100644 index 91dbc7d87..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/bitvector.csv +++ /dev/null @@ -1,5 +0,0 @@ -id,attr1,attr2,attr3 -hoge,0,1,3 -huga,0,2,1 -moge,1,0,4 -moga,0,1,2 diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/diff_col.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/diff_col.csv deleted file mode 100644 index 9131dae09..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/diff_col.csv +++ /dev/null @@ -1,6 +0,0 @@ -id,attr1,attr2,attr3,attr5,attr6 -hoge,0,0.77,0.63,0.35,0.39,0.35 -huga,0,0.37,0.36,0.43,0.41,0.39 -piyo,1,0.34,0.34,0.44,0.50,0.32 -moge,1,0.47,0.43,0.34,0.29,0.34 -0,0.67,0.41,0.25,0.49,0.25 diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/edge_data.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/edge_data.csv deleted file mode 100644 index c457363fa..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/edge_data.csv +++ /dev/null @@ -1,2 +0,0 @@ -id,zero,int_max,int_min,float_min_plus,float_max_minus,string_max 
-0,0,10000000000,-10000000000,0.00000000001,-0.00000000001,漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢 diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/empty.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/empty.csv deleted file mode 100644 index 074d1eeb4..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/empty.csv +++ /dev/null @@ -1 +0,0 @@ -id diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/none.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/none.csv deleted file mode 100644 index baab9dd16..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/none.csv +++ /dev/null @@ -1,3 +0,0 @@ -id,n1,n2 -0,,10 - diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/normal.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/normal.csv deleted file mode 100644 index caceee235..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/normal.csv +++ /dev/null @@ -1,6 +0,0 @@ -id,attr1,attr2,attr3,attr4,attr5,attr6 -hoge,0,0.77,0.63,0.35,0.39,0.35 -huga,0,0.37,0.36,0.43,0.41,0.39 -piyo,1,0.34,0.34,0.44,0.50,0.32 -moge,1,0.47,0.43,0.34,0.29,0.34 -moga,0,0.67,0.41,0.25,0.49,0.25 diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/not_csv.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/not_csv.csv deleted file mode 100644 index aa6195e63..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/not_csv.csv +++ /dev/null @@ -1,4 +0,0 @@ -{ - "name": "I am json.", - "type": "json" -} diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/over_number.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/over_number.csv deleted file mode 100644 
index 7830e1d40..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/over_number.csv +++ /dev/null @@ -1,2 +0,0 @@ -id,number -0,13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006084097 diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/string_data.csv b/packages/client/libclient-py/tests/unit_tests/pandas/test_files/string_data.csv deleted file mode 100644 index c2cba4ffd..000000000 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_files/string_data.csv +++ /dev/null @@ -1,2 +0,0 @@ -id,alphabet,hiragana,katakana,chinese_characters,large_number,emoji -0,abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ,ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ,ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺ,春眠不覚暁処処聞啼鳥夜来風雨声花落知多少,1234567890,😀😁😂🤣😃😄😅😆😉😊😋😎😍😘😗😙😚☺🙂🤗🤩🤔🤨😐😑😶🙄😏😣😥😮🤐😯😪😫😴😌😛😜😝🤤😒😓😔😕🙃🤑😲☹🙁😖😞😟😤😢😭😦😧😨😩🤯😬😰😱😳🤪😵😡😠🤬😷🤒🤕🤢🤮🤧😇🤠🤡🤥🤫🤭🧐🤓 diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py b/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py index dfe922f07..8b3c12364 100644 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py +++ b/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from quickmpc.pandas.parser import parse, parse_csv, to_float_for_matching +from quickmpc.pandas.parser import parse, to_float_for_matching from quickmpc.proto.common_types.common_types_pb2 import (Schema, ShareValueTypeEnum) @@ -21,96 +21,21 @@ def schema_int(name: str): .SHARE_VALUE_TYPE_UTF_8_INTEGER_REPRESENTATION) -# 元データ -normal_data: List[List[str]] = [s.split(",") for s in [ - "id,attr1,attr2,attr3,attr4,attr5,attr6", - "hoge,0,0.77,0.63,0.35,0.39,0.35", - "huga,0,0.37,0.36,0.43,0.41,0.39", - "piyo,1,0.34,0.34,0.44,0.50,0.32", 
- "moge,1,0.47,0.43,0.34,0.29,0.34", - "moga,0,0.67,0.41,0.25,0.49,0.25", -]] -data3: List[List[str]] = [s.split(",") for s in [ - "id,id:id", - "hoge,hoge", - "huga,huga", - "moge,moge", - "moga,moga", -]] - -# 正しくparseされたデータ -d1_schema_str: List[str] = ['id', 'attr1', 'attr2', - 'attr3', 'attr4', 'attr5', 'attr6'] -d1_schema: List[Schema] = [schema_fp(name) for name in d1_schema_str] -d1_secrets: List[List[float]] = [ - [230379555.4797964, 0, 0.77, 0.63, 0.35, 0.39, 0.35], - [10723675.973257065, 0, 0.37, 0.36, 0.43, 0.41, 0.39], - [117576607.23670769, 1, 0.34, 0.34, 0.44, 0.5, 0.32], - [211114761.8482437, 1, 0.47, 0.43, 0.34, 0.29, 0.34], - [13292676.303739548, 0, 0.67, 0.41, 0.25, 0.49, 0.25] -] - -d2_schema_str: List[str] = ['id#0', 'attr1#0', 'attr1#1', - 'attr2#0', 'attr2#1', 'attr2#2', - 'attr3#0', 'attr3#1', 'attr3#2', 'attr3#3'] -d2_schema: List[Schema] = [schema_fp(name) for name in d2_schema_str] -d2_secrets: List[List[float]] = [ - [230379555.4797964, 1, 0, 1, 0, 0, 1, 0, 0, 0], - [10723675.973257065, 1, 0, 0, 1, 0, 0, 1, 0, 0], - [211114761.8482437, 0, 1, 0, 0, 1, 0, 0, 1, 0], - [13292676.303739548, 1, 0, 1, 0, 0, 0, 0, 0, 1] -] - -d3_schema: List[Schema] = [schema_int('id'), schema_fp('id:id')] -d3_secrets: List[List[float]] = [ - [1752131429, 230379555.4797964], - [1752524641, 10723675.973257065], - [1836017509, 211114761.8482437], - [1836017505, 13292676.303739548] -] - - -def test_parse(): - """ 正しくパースできるかTest """ - secrets, schema = parse(normal_data, matching_column=1) - assert (np.allclose(secrets, d1_secrets)) - assert (schema == d1_schema) - - -def test_parse_str(): - secrets, schema = parse(data3) - assert (np.allclose(secrets, d3_secrets)) - assert (schema == d3_schema) - - -def test_parse_errorhandring(): - """ 異常値を与えてエラーが出るかTest """ - with pytest.raises(Exception): - # 行が足りずシェアがない - parse([["id", "a", "b", "c"]]) - with pytest.raises(Exception): - # schemaに同じものが含まれる - parse([["id", "a", "a"], - ["id1", "1", "2"], - ["id2", "3", "4"]]) - 
with pytest.raises(Exception): - # 正方行列でない - parse([["id", "a", "b"], - ["id1", "1", "2"], - ["id2", "3", "4", "5"]]) - parse([["id", "a", "b"], - ["id1", "1", "2"], - ["id2"]]) - - @pytest.mark.parametrize( - ("csv_file", "expected_secrets", "expected_schema"), + ("data", "expected_secrets", "expected_schema"), [ - # 動作確認 - ("normal.csv", d1_secrets, d1_schema), - - # エッジケース - ("edge_data.csv", + # 通常case + ([["id", "a", "b"], ["1.0", "0", "0.77"], ["2.0", "0", "0.37"]], + [[1.0, 0, 0.77], [2.0, 0, 0.37]], + [schema_fp(name) for name in ["id", "a", "b"]]), + # tag付き + ([["id", "id:id"], ["str", "2.0"]], + [[7566450, 2.0]], + [schema_int('id'), schema_fp('id:id')]), + + # edge case + ([["id", "zero", "int_max", "int_min", "float_min_plus", "float_max_minus", "string_max"], + ["0", "0", "10000000000", "-10000000000", "0.00000000001", "-0.00000000001", "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"]], [[0.0, 0.0, 10000000000.0, -10000000000.0, 1e-11, -1e-11, int("81294350169683468997949680580862592771577912922072" "36632400050104509615137476244113137539228236962890" @@ -140,7 +65,9 @@ def test_parse_errorhandring(): schema_fp('float_max_minus'), schema_int('string_max'), ]), # 文字列 - ("string_data.csv", + ([["id", "alphabet", "hiragana", "katakana", "chinese_characters", "large_number", "emoji"], + ["0", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ", + "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺ", "春眠不覚暁処処聞啼鳥夜来風雨声花落知多少", "1234567890", "😀😁😂🤣😃😄😅😆😉😊😋😎😍😘😗😙😚☺🙂🤗🤩🤔🤨😐😑😶🙄😏😣😥😮🤐😯😪😫😴😌😛😜😝🤤😒😓😔😕🙃🤑😲☹🙁😖😞😟😤😢😭😦😧😨😩🤯😬😰😱😳🤪😵😡😠🤬😷🤒🤕🤢🤮🤧😇🤠🤡🤥🤫🤭🧐🤓"]], [[0.0, int("64376492020959182960102910068921920137578270955447" "00994229324997296606723909755334743502970718979797" @@ -197,14 +124,11 @@ def test_parse_errorhandring(): 
schema_int('katakana'), schema_int('chinese_characters'), schema_fp('large_number'), # TODO: 文字列として解釈してほしい schema_int('emoji'), - ]), + ]), ] ) -def test_parse_csv(csv_file, expected_secrets, expected_schema): - """ csvを正しくパースできるかTest """ - secrets, schema = parse_csv( - f"{os.path.dirname(__file__)}/test_files/{csv_file}", - matching_column=1) +def test_parse(data, expected_secrets, expected_schema): + secrets, schema = parse(data) for row, row_expected in zip(secrets, expected_secrets): for x, y in zip(row, row_expected): if type(x) == int: @@ -215,28 +139,32 @@ def test_parse_csv(csv_file, expected_secrets, expected_schema): @pytest.mark.parametrize( - ("csv_file", "expected_exception"), + ("data", "expected_exception"), [ - # ファイルが存在しない - ("hoge", Exception), - - # 列数が異なる - ("diff_col.csv", Exception), - - # テーブルが空 - ("empty.csv", Exception), - - # 空のデータが存在する - ("none.csv", Exception), - - # csv形式じゃない - ("not_csv.csv", Exception), + # 行が足りずシェアがない + ([["id", "a", "b", "c"]], RuntimeError), + # schemaに同じものが含まれる + ([["id", "a", "a"], + ["id1", "1", "2"], + ["id2", "3", "4"]], RuntimeError), + # 正方行列でない + ([["id", "a", "b"], + ["id1", "1", "2"], + ["id2", "3", "4", "5"]], RuntimeError), + ([["id", "a", "b"], + ["id1", "1", "2"], + ["id2"]], RuntimeError), + # tableが空 + ([["id"]], RuntimeError), + # csv形式でない + ({"name": "I am json"}, KeyError), ] + ) -def test_parse_csv_errorhandring(csv_file, expected_exception): +def test_parse_errorhandring(data, expected_exception): """ 異常値を与えてエラーが出るかTest """ with pytest.raises(expected_exception): - parse_csv(f"{os.path.dirname(__file__)}/test_files/{csv_file}") + parse(data) @pytest.mark.parametrize( From 3d44bfc9ee905568fba2abfef9ffb2a4d0d23299 Mon Sep 17 00:00:00 2001 From: Nakata Date: Mon, 25 Sep 2023 18:05:28 +0900 Subject: [PATCH 4/7] Fix flake8 --- .../libclient-py/quickmpc/pandas/parser.py | 1 - .../tests/unit_tests/pandas/test_parser.py | 34 ++++++++++++++----- .../tests/unit_tests/pandas/test_reader.py | 15 +++++--- 
3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/packages/client/libclient-py/quickmpc/pandas/parser.py b/packages/client/libclient-py/quickmpc/pandas/parser.py index 02399688d..e108ae77a 100644 --- a/packages/client/libclient-py/quickmpc/pandas/parser.py +++ b/packages/client/libclient-py/quickmpc/pandas/parser.py @@ -1,4 +1,3 @@ -import csv from dataclasses import dataclass from hashlib import sha512 from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py b/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py index 8b3c12364..970dbd6c9 100644 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py +++ b/packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py @@ -1,8 +1,5 @@ import math -import os -from typing import List -import numpy as np import pytest from quickmpc.pandas.parser import parse, to_float_for_matching @@ -34,8 +31,15 @@ def schema_int(name: str): [schema_int('id'), schema_fp('id:id')]), # edge case - ([["id", "zero", "int_max", "int_min", "float_min_plus", "float_max_minus", "string_max"], - ["0", "0", "10000000000", "-10000000000", "0.00000000001", "-0.00000000001", "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"]], + ([["id", "zero", "int_max", "int_min", + "float_min_plus", "float_max_minus", "string_max"], + ["0", "0", + "10000000000", "-10000000000", "0.00000000001", "-0.00000000001", + "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢" + "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢" + "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢" + "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢" + "漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"]], [[0.0, 0.0, 10000000000.0, -10000000000.0, 1e-11, -1e-11, int("81294350169683468997949680580862592771577912922072" "36632400050104509615137476244113137539228236962890" @@ -65,9 +69,20 @@ def schema_int(name: 
str): schema_fp('float_max_minus'), schema_int('string_max'), ]), # 文字列 - ([["id", "alphabet", "hiragana", "katakana", "chinese_characters", "large_number", "emoji"], - ["0", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ", - "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺ", "春眠不覚暁処処聞啼鳥夜来風雨声花落知多少", "1234567890", "😀😁😂🤣😃😄😅😆😉😊😋😎😍😘😗😙😚☺🙂🤗🤩🤔🤨😐😑😶🙄😏😣😥😮🤐😯😪😫😴😌😛😜😝🤤😒😓😔😕🙃🤑😲☹🙁😖😞😟😤😢😭😦😧😨😩🤯😬😰😱😳🤪😵😡😠🤬😷🤒🤕🤢🤮🤧😇🤠🤡🤥🤫🤭🧐🤓"]], + ([["id", "alphabet", "hiragana", "katakana", + "chinese_characters", "large_number", "emoji"], + ["0", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", + "ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそ" + "ぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺ" + "ほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ", + "ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソ" + "ゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペ" + "ホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺ", + "春眠不覚暁処処聞啼鳥夜来風雨声花落知多少", + "1234567890", + "😀😁😂🤣😃😄😅😆😉😊😋😎😍😘😗😙😚☺🙂🤗🤩🤔🤨😐😑😶🙄😏😣😥" + "😮🤐😯😪😫😴😌😛😜😝🤤😒😓😔😕🙃🤑😲☹🙁😖😞😟😤😢😭😦😧😨😩" + "🤯😬😰😱😳🤪😵😡😠🤬😷🤒🤕🤢🤮🤧😇🤠🤡🤥🤫🤭🧐🤓"]], [[0.0, int("64376492020959182960102910068921920137578270955447" "00994229324997296606723909755334743502970718979797" @@ -173,7 +188,8 @@ def test_parse_errorhandring(data, expected_exception): # 文字列 ("id1", 125382372.3739109), ("hogehuga@gmail.com", 86705962.83638954), - ("very_very_very_very_very_very_very_very_very_long_string", 190655731.5899248), + ("very_very_very_very_very_very_very_very_very_long_string", + 190655731.5899248), ("日本語の文字列デスヨ", 40972936.22852039), ("❗️✨🤟😁👍感謝❗️🙌✨感謝❗️🙌✨❗️🍖😋🍴✨", 62936327.66452408), ("", 217595411.34348965), diff --git a/packages/client/libclient-py/tests/unit_tests/pandas/test_reader.py b/packages/client/libclient-py/tests/unit_tests/pandas/test_reader.py index 7ad830e0f..e47d3adc0 100644 --- a/packages/client/libclient-py/tests/unit_tests/pandas/test_reader.py +++ b/packages/client/libclient-py/tests/unit_tests/pandas/test_reader.py @@ -24,18 +24,23 @@ def to_string_io(data: List[List]) -> 
io.StringIO: columns=["id", "c", "__qmpc_sort_index__"])), # 1列目以外をID列に指定した場合 ([["id", "c"], ["a", 1], ["b", 2]], "c", - pd.DataFrame([["a", 81786090.20335388, 1], ["b", 67839041.07183933, 0]], + pd.DataFrame([["a", 81786090.20335388, 1], + ["b", 67839041.07183933, 0]], columns=["id", "c", "__qmpc_sort_index__"])), # ID列が巨大な整数 ([["id"], [1000000000000000000000000]], "id", - pd.DataFrame([[250298887.90448284, 0]], columns=["id", "__qmpc_sort_index__"])), + pd.DataFrame([[250298887.90448284, 0]], + columns=["id", "__qmpc_sort_index__"])), ([["id"], [-1000000000000000000000000]], "id", - pd.DataFrame([[146263071.2934265, 0]], columns=["id", "__qmpc_sort_index__"])), + pd.DataFrame([[146263071.2934265, 0]], + columns=["id", "__qmpc_sort_index__"])), # ID列が巨大な実数 ([["id"], [1000000000000000000000000.1111111111111111]], "id", - pd.DataFrame([[108235191.58269978, 0]], columns=["id", "__qmpc_sort_index__"])), + pd.DataFrame([[108235191.58269978, 0]], + columns=["id", "__qmpc_sort_index__"])), ([["id"], [-1000000000000000000000000.1111111111111111]], "id", - pd.DataFrame([[264110776.18418598, 0]], columns=["id", "__qmpc_sort_index__"])), + pd.DataFrame([[264110776.18418598, 0]], + columns=["id", "__qmpc_sort_index__"])), ] ) def test_read_csv(data, index_col, expected, From 2477b39ab56ca78511d462faf5f20de32df8a450 Mon Sep 17 00:00:00 2001 From: Nakata Date: Mon, 25 Sep 2023 18:19:34 +0900 Subject: [PATCH 5/7] Update libc readmee --- packages/client/libclient-py/README-ja.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/client/libclient-py/README-ja.md b/packages/client/libclient-py/README-ja.md index 131bd3f34..e09b40a2a 100644 --- a/packages/client/libclient-py/README-ja.md +++ b/packages/client/libclient-py/README-ja.md @@ -39,3 +39,8 @@ $ pipenv run make fmt 3.7.10 テストは3.7,3.8,3.9でされているため,いずれも動作する. + +## 入力仕様 +`qpd.read_csv`で受け取るCSVデータではあらゆる数値,文字列を入力として受け付け, +数値は64bit浮動小数,文字列は任意精度整数として扱う. 
+そのため,64bit浮動小数で表現できない数値については精度が保証されない.特にマッチングで使用する列の場合は全く異なる値にparseされる可能性があるため,文字列として入力することを推奨する. From 527252b5cea5e6ce1d07dd9f908282a125143f8d Mon Sep 17 00:00:00 2001 From: Nakata Date: Mon, 25 Sep 2023 18:25:04 +0900 Subject: [PATCH 6/7] Update libc readme en --- packages/client/libclient-py/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/client/libclient-py/README.md b/packages/client/libclient-py/README.md index ba73c288f..26dc47772 100644 --- a/packages/client/libclient-py/README.md +++ b/packages/client/libclient-py/README.md @@ -34,3 +34,7 @@ $ pipenv run make fmt 3.7.10 Tests have been done with 3.7, 3.8, and 3.9, so they all work. + +## Input specifications +`qpd.read_csv` accepts any number or character string in the CSV data it reads; numbers are treated as 64-bit floating-point numbers and character strings are treated as arbitrary-precision integers. +Therefore, the precision of numbers that cannot be represented as 64-bit floating-point numbers is not guaranteed. In particular, values in a column used for matching may be parsed into completely different values, so it is recommended to input them as strings. 
From b9d19b1ab6ace3a14b554c629545a30ad5261852 Mon Sep 17 00:00:00 2001 From: Nakata Date: Mon, 25 Sep 2023 18:51:27 +0900 Subject: [PATCH 7/7] Fix send_share test --- scripts/libclient/src/tests/test_send_share.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/libclient/src/tests/test_send_share.py b/scripts/libclient/src/tests/test_send_share.py index 218198201..fc19fbad9 100644 --- a/scripts/libclient/src/tests/test_send_share.py +++ b/scripts/libclient/src/tests/test_send_share.py @@ -9,8 +9,6 @@ [ (data_frame([[1, 2], [3, 4]], columns=["s1", "s2"])), (data_frame([[1]], columns=["s1"])), - (data_frame([["a", 1], ["b", 2]], columns=["id", "s1"])), - (data_frame([["a", "x"], ["b", "y"]], columns=["id", "xy"])), ] ) def test_send_share_from_csv_data(df: pd.DataFrame): @@ -23,8 +21,6 @@ def test_send_share_from_csv_data(df: pd.DataFrame): [ (data_frame([[1, 2], [3, 4]], columns=["s1", "s2"])), (data_frame([[1]], columns=["s1"])), - (data_frame([["a", 1], ["b", 2]], columns=["id", "s1"])), - (data_frame([["a", "x"], ["b", "y"]], columns=["id", "xy"])), ] ) def test_load_data_id(df: pd.DataFrame):