Skip to content
This repository was archived by the owner on Mar 25, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions packages/client/libclient-py/README-ja.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,8 @@ $ pipenv run make fmt
3.7.10

テストは3.7,3.8,3.9でされているため,いずれも動作する.

## 入力仕様
`qpd.read_csv`で受け取るCSVデータではあらゆる数値,文字列を入力として受け付け,
数値は64bit浮動小数,文字列は任意精度整数として扱う.
そのため,64bit浮動小数で表現できない数値については精度が保証されない.特にマッチングで使用する列の場合は全く異なる値にparseされる可能性があるため,文字列として入力することを推奨する.
4 changes: 4 additions & 0 deletions packages/client/libclient-py/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,7 @@ $ pipenv run make fmt
3.7.10

Tests have been done with 3.7, 3.8, and 3.9, so they all work.

## Input specifications
`qpd.read_csv` accepts any number or character string in the CSV data it reads; numbers are treated as 64-bit floating-point numbers and character strings are treated as arbitrary-precision integers.
Therefore, precision is not guaranteed for numbers that cannot be represented as 64-bit floating-point numbers. In particular, values in columns used for matching may be parsed into completely different values, so it is recommended to input them as strings.
39 changes: 12 additions & 27 deletions packages/client/libclient-py/quickmpc/pandas/parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import csv
from dataclasses import dataclass
from hashlib import sha512
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
Expand Down Expand Up @@ -66,20 +65,16 @@ def format_check(secrets: List[List[ShareValueType]],
return True


def to_float(val: str) -> float:
    """Convert ``val`` to a float, hashing it when it is not numeric.

    Strings accepted by ``float()`` are returned as that float.  Any other
    string is hashed with SHA-512 and mapped to a float derived from the
    leading bits of the digest.
    """
    try:
        return float(val)
    except ValueError:
        pass

    # bit_width and shift are constants used in the comparison operation.
    # Due to the limitation of the comparison operation, bit_width bits are
    # taken out of the digest and divided by 2^shift.
    bit_width: int = 48
    shift: int = 20
    hex_len: int = bit_width >> 2  # one hex digit carries 4 bits
    digest_hex: str = sha512(val.encode()).hexdigest()
    return int(digest_hex[:hex_len], 16) / pow(2, shift)
def to_float_for_matching(val: Union[str, int, float], *,
                          k: int = 48, m: int = 20) -> float:
    """Hash ``val`` into a non-negative float usable as a matching key.

    ``val`` is converted with ``str()``, hashed with SHA-512, and the
    leading ``k`` bits of the digest are divided by ``2**m``.  The value is
    always hashed, even when it looks numeric.  Because the hash is taken
    over the ``str()`` form, ``1`` and ``"1"`` map to the same key while
    ``1.0`` (stringified as ``"1.0"``) maps to a different one.

    Args:
        val: The identifier to hash.
        k: Number of leading digest bits to keep (1..512).
        m: Exponent of the power of two the kept bits are divided by.

    Returns:
        The hashed value as a float; with the defaults it lies in
        ``[0, 2**28)``.

    Raises:
        ValueError: If ``k`` is outside ``1..512``.
    """
    # k,m are constants used in the comparison operation.
    # Due to the limitation of comparison operation,
    # k bits are taken out and divided by 2^m.
    digest: bytes = sha512(str(val).encode()).digest()
    digest_bits: int = len(digest) * 8
    if not 0 < k <= digest_bits:
        raise ValueError(f"k must be in 1..{digest_bits}, got {k}")
    # Keep exactly the top k bits of the big-endian digest; for k=48 this
    # equals the first 12 hex digits of the hex digest.
    val_int: int = int.from_bytes(digest, "big") >> (digest_bits - k)
    return val_int / 2 ** m


def to_int(val: str, encoding='utf-8') -> int:
Expand Down Expand Up @@ -133,11 +128,11 @@ def find_types(schema: List[str],
def convert(element: str,
type_info: ShareValueTypeEnum.ValueType) -> ShareValueType:
if type_info == ShareValueTypeEnum.Value('SHARE_VALUE_TYPE_FIXED_POINT'):
return to_float(element)
return float(element)
if type_info == ShareValueTypeEnum.Value(
'SHARE_VALUE_TYPE_UTF_8_INTEGER_REPRESENTATION'):
return to_int(element)
return to_float(element)
return float(element)


def parse(data: List[List[str]], matching_column: Optional[int] = None) \
Expand All @@ -146,7 +141,6 @@ def parse(data: List[List[str]], matching_column: Optional[int] = None) \
types = find_types(schema_name, data[1:], matching_column)
schema = [Schema(name=name, type=type)
for name, type in zip(schema_name, types)]

# check size first because the iterator returned by the built-in `zip`
# function stops when the shortest iterable is exhausted
if not FormatChecker.check_size(data[1:], schema):
Expand All @@ -160,12 +154,3 @@ def parse(data: List[List[str]], matching_column: Optional[int] = None) \
raise RuntimeError("規定されたフォーマットでないデータです.")

return secrets, schema


def parse_csv(
        filename: str, matching_column: Optional[int] = None) \
        -> Tuple[List[List[ShareValueType]], List[Schema]]:
    """Read a CSV file and parse its rows with ``parse``.

    Args:
        filename: Path of the CSV file to read.
        matching_column: Forwarded unchanged to ``parse``.

    Returns:
        Whatever ``parse`` returns for the rows read from the file.
    """
    # The csv module requires newline="" so that newlines embedded in
    # quoted fields are handled by the reader instead of being translated
    # by the text layer.
    with open(filename, newline="") as f:
        text: List[List[str]] = list(csv.reader(f))
    return parse(text, matching_column)
4 changes: 2 additions & 2 deletions packages/client/libclient-py/quickmpc/pandas/readers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd

from quickmpc.pandas.parser import to_float
from quickmpc.pandas.parser import to_float_for_matching


def read_csv(*args, index_col: str, **kwargs) -> pd.DataFrame:
Expand All @@ -23,7 +23,7 @@ def read_csv(*args, index_col: str, **kwargs) -> pd.DataFrame:
"""
df = pd.read_csv(*args, **kwargs)
# ID列を数値化
df[index_col] = df[index_col].map(lambda x: to_float(x))
df[index_col] = df[index_col].map(lambda x: to_float_for_matching(x))
# send_share時にID列でsortできる様にID列を座標圧縮した列を追加する
df["__qmpc_sort_index__"] = df.index
df = df.sort_values(by=index_col)
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

196 changes: 81 additions & 115 deletions packages/client/libclient-py/tests/unit_tests/pandas/test_parser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
import math
import os
from typing import List

import numpy as np
import pytest

from quickmpc.pandas.parser import parse, parse_csv
from quickmpc.pandas.parser import parse, to_float_for_matching
from quickmpc.proto.common_types.common_types_pb2 import (Schema,
ShareValueTypeEnum)

Expand All @@ -21,96 +18,28 @@ def schema_int(name: str):
.SHARE_VALUE_TYPE_UTF_8_INTEGER_REPRESENTATION)


# 元データ
normal_data: List[List[str]] = [s.split(",") for s in [
"id,attr1,attr2,attr3,attr4,attr5,attr6",
"hoge,0,0.77,0.63,0.35,0.39,0.35",
"huga,0,0.37,0.36,0.43,0.41,0.39",
"piyo,1,0.34,0.34,0.44,0.50,0.32",
"moge,1,0.47,0.43,0.34,0.29,0.34",
"moga,0,0.67,0.41,0.25,0.49,0.25",
]]
data3: List[List[str]] = [s.split(",") for s in [
"id,id:id",
"hoge,hoge",
"huga,huga",
"moge,moge",
"moga,moga",
]]

# 正しくparseされたデータ
d1_schema_str: List[str] = ['id', 'attr1', 'attr2',
'attr3', 'attr4', 'attr5', 'attr6']
d1_schema: List[Schema] = [schema_fp(name) for name in d1_schema_str]
d1_secrets: List[List[float]] = [
[230379555.4797964, 0, 0.77, 0.63, 0.35, 0.39, 0.35],
[10723675.973257065, 0, 0.37, 0.36, 0.43, 0.41, 0.39],
[117576607.23670769, 1, 0.34, 0.34, 0.44, 0.5, 0.32],
[211114761.8482437, 1, 0.47, 0.43, 0.34, 0.29, 0.34],
[13292676.303739548, 0, 0.67, 0.41, 0.25, 0.49, 0.25]
]

d2_schema_str: List[str] = ['id#0', 'attr1#0', 'attr1#1',
'attr2#0', 'attr2#1', 'attr2#2',
'attr3#0', 'attr3#1', 'attr3#2', 'attr3#3']
d2_schema: List[Schema] = [schema_fp(name) for name in d2_schema_str]
d2_secrets: List[List[float]] = [
[230379555.4797964, 1, 0, 1, 0, 0, 1, 0, 0, 0],
[10723675.973257065, 1, 0, 0, 1, 0, 0, 1, 0, 0],
[211114761.8482437, 0, 1, 0, 0, 1, 0, 0, 1, 0],
[13292676.303739548, 1, 0, 1, 0, 0, 0, 0, 0, 1]
]

d3_schema: List[Schema] = [schema_int('id'), schema_fp('id:id')]
d3_secrets: List[List[float]] = [
[1752131429, 230379555.4797964],
[1752524641, 10723675.973257065],
[1836017509, 211114761.8482437],
[1836017505, 13292676.303739548]
]


def test_parse():
""" 正しくパースできるかTest """
secrets, schema = parse(normal_data, matching_column=1)
assert (np.allclose(secrets, d1_secrets))
assert (schema == d1_schema)


def test_parse_str():
secrets, schema = parse(data3)
assert (np.allclose(secrets, d3_secrets))
assert (schema == d3_schema)


def test_parse_errorhandring():
""" 異常値を与えてエラーが出るかTest """
with pytest.raises(Exception):
# 行が足りずシェアがない
parse([["id", "a", "b", "c"]])
with pytest.raises(Exception):
# schemaに同じものが含まれる
parse([["id", "a", "a"],
["id1", "1", "2"],
["id2", "3", "4"]])
with pytest.raises(Exception):
# 正方行列でない
parse([["id", "a", "b"],
["id1", "1", "2"],
["id2", "3", "4", "5"]])
parse([["id", "a", "b"],
["id1", "1", "2"],
["id2"]])


@pytest.mark.parametrize(
("csv_file", "expected_secrets", "expected_schema"),
("data", "expected_secrets", "expected_schema"),
[
# 動作確認
("normal.csv", d1_secrets, d1_schema),

# エッジケース
("edge_data.csv",
# 通常case
([["id", "a", "b"], ["1.0", "0", "0.77"], ["2.0", "0", "0.37"]],
[[1.0, 0, 0.77], [2.0, 0, 0.37]],
[schema_fp(name) for name in ["id", "a", "b"]]),
# tag付き
([["id", "id:id"], ["str", "2.0"]],
[[7566450, 2.0]],
[schema_int('id'), schema_fp('id:id')]),

# edge case
([["id", "zero", "int_max", "int_min",
"float_min_plus", "float_max_minus", "string_max"],
["0", "0",
"10000000000", "-10000000000", "0.00000000001", "-0.00000000001",
"漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"
"漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"
"漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"
"漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"
"漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢漢"]],
[[0.0, 0.0, 10000000000.0, -10000000000.0, 1e-11, -1e-11,
int("81294350169683468997949680580862592771577912922072"
"36632400050104509615137476244113137539228236962890"
Expand Down Expand Up @@ -140,7 +69,20 @@ def test_parse_errorhandring():
schema_fp('float_max_minus'), schema_int('string_max'), ]),

# 文字列
("string_data.csv",
([["id", "alphabet", "hiragana", "katakana",
"chinese_characters", "large_number", "emoji"],
["0", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
"ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそ"
"ぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺ"
"ほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ",
"ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソ"
"ゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペ"
"ホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺ",
"春眠不覚暁処処聞啼鳥夜来風雨声花落知多少",
"1234567890",
"😀😁😂🤣😃😄😅😆😉😊😋😎😍😘😗😙😚☺🙂🤗🤩🤔🤨😐😑😶🙄😏😣😥"
"😮🤐😯😪😫😴😌😛😜😝🤤😒😓😔😕🙃🤑😲☹🙁😖😞😟😤😢😭😦😧😨😩"
"🤯😬😰😱😳🤪😵😡😠🤬😷🤒🤕🤢🤮🤧😇🤠🤡🤥🤫🤭🧐🤓"]],
[[0.0,
int("64376492020959182960102910068921920137578270955447"
"00994229324997296606723909755334743502970718979797"
Expand Down Expand Up @@ -197,14 +139,11 @@ def test_parse_errorhandring():
schema_int('katakana'), schema_int('chinese_characters'),
schema_fp('large_number'), # TODO: 文字列として解釈してほしい
schema_int('emoji'),
]),
]),
]
)
def test_parse_csv(csv_file, expected_secrets, expected_schema):
""" csvを正しくパースできるかTest """
secrets, schema = parse_csv(
f"{os.path.dirname(__file__)}/test_files/{csv_file}",
matching_column=1)
def test_parse(data, expected_secrets, expected_schema):
secrets, schema = parse(data)
for row, row_expected in zip(secrets, expected_secrets):
for x, y in zip(row, row_expected):
if type(x) == int:
Expand All @@ -215,25 +154,52 @@ def test_parse_csv(csv_file, expected_secrets, expected_schema):


@pytest.mark.parametrize(
("csv_file", "expected_exception"),
("data", "expected_exception"),
[
# ファイルが存在しない
("hoge", Exception),

# 列数が異なる
("diff_col.csv", Exception),
# 行が足りずシェアがない
([["id", "a", "b", "c"]], RuntimeError),
# schemaに同じものが含まれる
([["id", "a", "a"],
["id1", "1", "2"],
["id2", "3", "4"]], RuntimeError),
# 正方行列でない
([["id", "a", "b"],
["id1", "1", "2"],
["id2", "3", "4", "5"]], RuntimeError),
([["id", "a", "b"],
["id1", "1", "2"],
["id2"]], RuntimeError),
# tableが空
([["id"]], RuntimeError),
# csv形式でない
({"name": "I am json"}, KeyError),
]

# テーブルが空
("empty.csv", Exception),
)
def test_parse_errorhandring(data, expected_exception):
""" 異常値を与えてエラーが出るかTest """
with pytest.raises(expected_exception):
parse(data)

# 空のデータが存在する
("none.csv", Exception),

# csv形式じゃない
("not_csv.csv", Exception),
@pytest.mark.parametrize(
("val", "expected"),
[
# 文字列
("id1", 125382372.3739109),
("hogehuga@gmail.com", 86705962.83638954),
("very_very_very_very_very_very_very_very_very_long_string",
190655731.5899248),
("日本語の文字列デスヨ", 40972936.22852039),
("❗️✨🤟😁👍感謝❗️🙌✨感謝❗️🙌✨❗️🍖😋🍴✨", 62936327.66452408),
("", 217595411.34348965),
# 数値
(0, 52152834.036356926),
(1, 81786090.20335388),
(1000000000000000000000, 35709723.87166405),
(1.1, 199269155.31771374),
(1000000000000000000000.11111111111111111111, 19815069.039226532),
]
)
def test_parse_csv_errorhandring(csv_file, expected_exception):
""" 異常値を与えてエラーが出るかTest """
with pytest.raises(expected_exception):
parse_csv(f"{os.path.dirname(__file__)}/test_files/{csv_file}")
def test_to_float_for_maching(val, expected):
assert math.isclose(to_float_for_matching(val), expected)
Loading