#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2020 Alibaba Group Holding Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import functools
import json
import os
import platform
import random
import re
import shutil
import socket
import string
import subprocess
import tempfile
import threading
import time
import warnings
from queue import Empty
from queue import Queue
import numpy as np
import pandas as pd
import psutil
from google.protobuf.any_pb2 import Any
from graphscope.client.archive import OutArchive
from graphscope.framework.errors import check_argument
from graphscope.proto import attr_value_pb2
from graphscope.proto import data_types_pb2
from graphscope.proto import graph_def_pb2
from graphscope.proto import types_pb2
class PipeWatcher(object):
def __init__(self, pipe, sink, queue=None, drop=True, suppressed=False):
"""Watch a pipe, and buffer its output if drop is False."""
self._pipe = pipe
self._sink = sink
self._drop = drop
self._suppressed = suppressed
self._filters = []
if queue is None:
self._lines = Queue()
else:
self._lines = queue
def read_and_poll(self):
for line in self._pipe:
if self._filter(line) and not self._suppressed:
try:
self._sink.write(line)
self._sink.flush()
if not self._drop:
self._lines.put(line)
except: # noqa: E722, pylint: disable=bare-except
pass
self._polling_thread = threading.Thread(target=read_and_poll, args=(self,))
self._polling_thread.daemon = True
self._polling_thread.start()
def poll(self, block=True, timeout=None):
return self._lines.get(block=block, timeout=timeout)
def poll_all(self):
while True:
try:
yield self._lines.get(block=False)
except Empty:
break
def drop(self, drop=True):
self._drop = drop
def suppress(self, suppressed=True):
self._suppressed = suppressed
def add_filter(self, func):
        if func not in self._filters:
self._filters.append(func)
def _filter(self, line):
for func in self._filters:
if not func(line): # assume callable - will raise if not
return False
return True
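
# Example (illustrative sketch, not exercised by this module): tee a child
# process's stdout to our own stdout while keeping every line for later
# inspection. The `["echo", "hello"]` command is only a placeholder.
#
#   import sys
#   proc = subprocess.Popen(
#       ["echo", "hello"], stdout=subprocess.PIPE, universal_newlines=True
#   )
#   watcher = PipeWatcher(proc.stdout, sys.stdout, drop=False)
#   print(watcher.poll(timeout=5))  # blocks until the first line arrives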
class PipeMerger(object):
def __init__(self, pipe1, pipe2):
self._queue = Queue()
self._stop = False
def read_and_pool(self, tag, pipe, target: Queue):
while True:
try:
msg = (pipe.poll(), "")
target.put(msg if tag == "out" else msg[::-1])
except Exception:
time.sleep(1)
if self._stop:
break
self._pipe1_thread = threading.Thread(
target=read_and_pool, args=(self, "out", pipe1, self._queue)
)
self._pipe1_thread.daemon = True
self._pipe2_thread = threading.Thread(
target=read_and_pool, args=(self, "err", pipe2, self._queue)
)
self._pipe2_thread.daemon = True
self._pipe1_thread.start()
self._pipe2_thread.start()
def poll(self, block=True, timeout=None):
return self._queue.get(block=block, timeout=timeout)
def stop(self):
self._stop = True
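
# Example (hedged sketch): merge the stdout and stderr of two PipeWatcher
# instances into a single stream of (out, err) tuples; stdout lines arrive as
# (line, ""), stderr lines as ("", line). `proc` is assumed to be a
# subprocess.Popen handle created elsewhere with PIPE for both streams.
#
#   import sys
#   out_watcher = PipeWatcher(proc.stdout, sys.stdout, drop=False)
#   err_watcher = PipeWatcher(proc.stderr, sys.stderr, drop=False)
#   merged = PipeMerger(out_watcher, err_watcher)
#   out_line, err_line = merged.poll(timeout=5)  # one side is always ""
#   merged.stop()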
def in_notebook():
try:
from IPython import get_ipython
shell = get_ipython().__class__.__name__
if shell == "ZMQInteractiveShell":
return True # Jupyter notebook or qtconsole
if shell == "TerminalInteractiveShell":
return False # Terminal running IPython
return False # Other type or standard python interpreter
except Exception:
return False
return False
def is_free_port(port, host="localhost", timeout=0.2):
"""Check if a port on a given host is in use or not.
Args:
port (int): Port number to check availability.
host (str): Hostname to connect to and check port availability.
timeout (float): Timeout used for socket connection.
Returns:
True if port is available, False otherwise.
"""
if host == "localhost" or host == "127.0.0.1":
try:
return int(port) not in [
conn.laddr.port for conn in psutil.net_connections()
]
        except psutil.AccessDenied:
            # fall back to the socket.connect probe below
            pass
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.settimeout(timeout)
        sock.connect((host, int(port)))
    except socket.error:
        return True
    else:
        return False
    finally:
        sock.close()
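
# Example (hedged): check whether a candidate port is usable before binding a
# service to it. The port number is arbitrary.
#
#   if is_free_port(8000, host="localhost"):
#       print("port 8000 looks free")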
def get_free_port(host="localhost", port_range=(32768, 64999)):
"""Get a free port on a given host.
Args:
host (str): Hostname you want to get the free port.
port_range (tuple): Try to get free port within this range.
Returns:
A free port on given host.
"""
while True:
port = random.randint(port_range[0], port_range[1])
if is_free_port(port, host=host):
return port
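
# Example (hedged): pick an ephemeral port for a locally launched service.
# Another process may still claim the port before it is actually bound, so
# callers typically retry on bind failure.
#
#   port = get_free_port(host="localhost", port_range=(40000, 50000))
#   print(f"launching service on port {port}")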
# Find the executables of the Java toolchain, such as java, javac and jar.
# If JAVA_HOME is set, use the executable under $JAVA_HOME/bin;
# otherwise fall back to the one found on the PATH.
def find_java_exe(exe_name="java"):
exe = None
if "JAVA_HOME" in os.environ:
exe = os.path.expandvars(f"$JAVA_HOME/bin/{exe_name}")
if exe is None:
exe = shutil.which(exe_name)
if exe is None:
raise RuntimeError(f"{exe_name} command not found.")
return exe
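
# Example (hedged): locate a Java tool, honoring $JAVA_HOME when it is set.
# The printed path is only an illustration.
#
#   javac_exe = find_java_exe("javac")  # raises RuntimeError if not found
#   print(javac_exe)                    # e.g. /usr/lib/jvm/java-11/bin/javac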
def get_java_version():
    # Use javac to get the Java version, since its output format is stable:
    # e.g. "javac 1.8.0_265" or "javac 19.0.2"; capture the major.minor part.
    # JDK 8 prints the version to stderr while newer JDKs print to stdout,
    # so search both streams.
    javac = find_java_exe("javac")
    result = subprocess.run([javac, "-version"], capture_output=True, text=True)
    match = re.search(r"javac (\d+\.\d+)", result.stdout + result.stderr)
    if match:
        return match.group(1)
    return None
def get_platform_info():
    def _get_gcc_version():
        gcc = shutil.which("gcc")
        if gcc is None:
            return None
        output = subprocess.check_output(
            [gcc, "--version"], stderr=subprocess.STDOUT, universal_newlines=True
        )
        # Only the first line carries the version; the rest is copyright text.
        return output.splitlines()[0]
platform_info = (
f"system: {platform.system()}\n"
f"machine: {platform.machine()}\n"
f"platform: {platform.platform()}\n"
f"uname: {platform.uname()}\n"
f"kernel_ver: {platform.version()}\n"
f"mac_ver: {platform.mac_ver()}\n"
f"gcc_ver: {_get_gcc_version()}\n"
f"python_ver: {platform.python_version()}\n"
)
return platform_info
def random_string(nlen):
"""Create random string which length is `nlen`."""
return "".join([random.choice(string.ascii_lowercase) for _ in range(nlen)])
def get_tempdir():
return os.path.join("/", tempfile.gettempprefix())
def get_timestamp(with_milliseconds=True):
"""Get current timestamp.
Returns:
Str of current time in seconds since the Epoch.
Examples:
>>> get_timestamp()
'1639108065.941239'
>>> get_timestamp(with_milliseconds=False)
'1639108065'
"""
t = str(time.time())
if not with_milliseconds:
t = t.split(".")[0]
return t
def read_file_to_bytes(file_path):
abs_dir = os.path.abspath(os.path.expanduser(file_path))
if os.path.isfile(abs_dir):
with open(abs_dir, "rb") as b:
content = b.read()
return content
raise IOError("No such file: " + file_path)
def i_to_attr(i: int) -> attr_value_pb2.AttrValue:
check_argument(isinstance(i, int))
return attr_value_pb2.AttrValue(i=i)
def u_to_attr(i: int) -> attr_value_pb2.AttrValue:
check_argument(isinstance(i, int) and i >= 0)
return attr_value_pb2.AttrValue(u=i)
def b_to_attr(b: bool) -> attr_value_pb2.AttrValue:
check_argument(isinstance(b, bool))
return attr_value_pb2.AttrValue(b=b)
def s_to_attr(s: str) -> attr_value_pb2.AttrValue:
check_argument(isinstance(s, str))
return attr_value_pb2.AttrValue(s=s.encode("utf-8", errors="ignore"))
def bytes_to_attr(s: bytes) -> attr_value_pb2.AttrValue:
check_argument(isinstance(s, bytes))
return attr_value_pb2.AttrValue(s=s)
def bytes_to_large_attr(s: bytes) -> attr_value_pb2.LargeAttrValue:
check_argument(isinstance(s, bytes))
large_attr = attr_value_pb2.LargeAttrValue()
chunk = attr_value_pb2.Chunk(buffer=s)
large_attr.chunk_list.items.append(chunk)
return large_attr
def f_to_attr(f: float) -> attr_value_pb2.AttrValue:
check_argument(isinstance(f, float))
return attr_value_pb2.AttrValue(f=f)
def type_to_attr(t):
return attr_value_pb2.AttrValue(i=t)
def graph_type_to_attr(t):
return attr_value_pb2.AttrValue(i=t)
def modify_type_to_attr(t):
return attr_value_pb2.AttrValue(i=t)
def report_type_to_attr(t):
return attr_value_pb2.AttrValue(i=t)
def list_str_to_attr(list_of_str):
attr = attr_value_pb2.AttrValue()
attr.list.s[:] = [
item.encode("utf-8", errors="ignore") if isinstance(item, str) else item
for item in list_of_str
]
return attr
def list_i_to_attr(list_i):
attr = attr_value_pb2.AttrValue()
attr.list.i[:] = list_i
return attr
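
# Example (hedged sketch): the *_to_attr helpers above wrap plain Python
# values into attr_value_pb2.AttrValue messages, typically collected into an
# op's attribute map. The key names used here are made up for illustration.
#
#   attrs = {
#       "chunk_size": i_to_attr(1024),
#       "directed": b_to_attr(True),
#       "efile": s_to_attr("/tmp/edges.csv"),
#       "labels": list_str_to_attr(["person", "software"]),
#   }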
def graph_type_to_cpp_class(graph_type):
if graph_type == graph_def_pb2.IMMUTABLE_EDGECUT:
return "grape::ImmutableEdgecutFragment"
if graph_type == graph_def_pb2.DYNAMIC_PROPERTY:
return "gs::DynamicFragment"
if graph_type == graph_def_pb2.DYNAMIC_PROJECTED:
return "gs::DynamicProjectedFragment"
if graph_type == graph_def_pb2.ARROW_PROPERTY:
return "vineyard::ArrowFragment"
if graph_type == graph_def_pb2.ARROW_PROJECTED:
return "gs::ArrowProjectedFragment"
return "null"
def pack(v):
param = Any()
if isinstance(v, bool):
param.Pack(data_types_pb2.BoolValue(value=v))
elif isinstance(v, int):
param.Pack(data_types_pb2.Int64Value(value=v))
elif isinstance(v, float):
param.Pack(data_types_pb2.DoubleValue(value=v))
elif isinstance(v, str):
param.Pack(data_types_pb2.StringValue(value=v))
elif isinstance(v, bytes):
param.Pack(data_types_pb2.BytesValue(value=v))
else:
raise ValueError("Wrong type of query param {}".format(type(v)))
return param
def pack_query_params(*args, **kwargs):
params = []
for i in args:
params.append(pack(i))
for _, v in kwargs.items():
params.append(pack(v))
return params
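
# Example (hedged): pack positional and keyword query parameters into
# google.protobuf.Any messages, e.g. for an interactive query call.
#
#   params = pack_query_params("marko", limit=10, weighted=True)
#   # -> [Any(StringValue), Any(Int64Value), Any(BoolValue)]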
def is_numpy(*args):
"""Check if the input type is numpy.ndarray."""
for arg in args:
if arg is not None and not isinstance(arg, np.ndarray):
raise ValueError("The parameter %s should be a numpy ndarray" % arg)
def is_file(*args):
"""Check if the input type is file."""
for arg in args:
if arg is not None and not isinstance(arg, str):
raise ValueError("the parameter %s should be a file path" % arg)
def _context_protocol_to_numpy_dtype(dtype):
dtype_map = {
0: np.dtype("void"),
1: np.dtype("bool"),
2: np.dtype("int32"),
3: np.dtype("uint32"),
4: np.dtype("int64"),
5: np.dtype("uint64"),
6: np.dtype("float32"),
7: np.dtype("float64"),
8: object, # string
}
npdtype = dtype_map.get(dtype)
if npdtype is None:
raise NotImplementedError("Don't support type {}".format(dtype))
return npdtype
def decode_numpy(value):
if not value:
raise RuntimeError("Value to decode should not be empty")
archive = OutArchive(value)
shape_size = archive.get_size()
shape = []
for _ in range(shape_size):
shape.append(archive.get_size())
dtype = _context_protocol_to_numpy_dtype(archive.get_int())
array_size = archive.get_size()
check_argument(array_size == np.prod(shape))
if dtype is object:
data_copy = []
for _ in range(array_size):
data_copy.append(archive.get_string())
if shape and shape[0] > 1:
array = np.reshape(data_copy, shape)
else:
array = np.array(data_copy, dtype=dtype)
else:
array = np.ndarray(
shape=shape,
dtype=dtype,
buffer=archive.get_block(array_size * dtype.itemsize),
order="C",
)
return array
def decode_dataframe(value):
if not value:
raise RuntimeError("Value to decode should not be empty")
archive = OutArchive(value)
column_num = archive.get_size()
row_num = archive.get_size()
arrays = {}
for _ in range(column_num):
col_name = archive.get_string()
dtype = _context_protocol_to_numpy_dtype(archive.get_int())
if dtype is object:
data_copy = []
for _ in range(row_num):
data_copy.append(archive.get_string())
array = np.array(data_copy, dtype=dtype)
else:
array = np.ndarray(
shape=(row_num,),
dtype=dtype,
buffer=archive.get_block(row_num * dtype.itemsize),
)
arrays[col_name] = array
return pd.DataFrame(arrays)
def _unify_str_type(t): # noqa: C901
t = t.lower()
if t in ("b", "bool"):
return graph_def_pb2.DataTypePb.BOOL
elif t in ("c", "char"):
return graph_def_pb2.DataTypePb.CHAR
elif t in ("s", "short"):
return graph_def_pb2.DataTypePb.SHORT
elif t in ("i", "int", "int32", "int32_t"):
return graph_def_pb2.DataTypePb.INT
elif t in ("l", "long", "int64_t", "int64"):
return graph_def_pb2.DataTypePb.LONG
elif t in ("uint32_t", "uint32"):
return graph_def_pb2.DataTypePb.UINT
elif t in ("uint64_t", "uint64"):
return graph_def_pb2.DataTypePb.ULONG
elif t in ("f", "float"):
return graph_def_pb2.DataTypePb.FLOAT
elif t in ("d", "double"):
return graph_def_pb2.DataTypePb.DOUBLE
elif t in ("str", "string", "std::string"):
return graph_def_pb2.DataTypePb.STRING
elif t == "bytes":
return graph_def_pb2.DataTypePb.BYTES
elif t == "date32[day]" or t == "date[32][day]" or t == "date32" or t == "date[32]":
return graph_def_pb2.DataTypePb.DATE32
elif t == "date64[ms]" or t == "date[64][ms]" or t == "date64" or t == "date[64]":
return graph_def_pb2.DataTypePb.DATE64
elif t == "time32[s]" or t == "time[32][s]":
return graph_def_pb2.DataTypePb.TIME32_S
elif t == "time32[ms]" or t == "time[32][ms]":
return graph_def_pb2.DataTypePb.TIME32_MS
elif t == "time32[us]" or t == "time[32][us]":
return graph_def_pb2.DataTypePb.TIME32_US
elif t == "time32[ns]" or t == "time[32][ns]":
return graph_def_pb2.DataTypePb.TIME32_NS
elif t == "time64[s]" or t == "time[64][s]":
return graph_def_pb2.DataTypePb.TIME64_S
elif t == "time64[ms]" or t == "time[64][ms]":
return graph_def_pb2.DataTypePb.TIME64_MS
elif t == "time64[us]" or t == "time[64][us]":
return graph_def_pb2.DataTypePb.TIME64_US
elif t == "time64[ns]" or t == "time[64][ns]":
return graph_def_pb2.DataTypePb.TIME64_NS
elif t.startswith("timestamp[s]"):
return graph_def_pb2.DataTypePb.TIMESTAMP_S
elif t.startswith("timestamp[ms]"):
return graph_def_pb2.DataTypePb.TIMESTAMP_MS
elif t.startswith("timestamp[us]"):
return graph_def_pb2.DataTypePb.TIMESTAMP_US
elif t.startswith("timestamp[ns]"):
return graph_def_pb2.DataTypePb.TIMESTAMP_NS
elif t == "int_list" or t.startswith("fixedlistint"):
return graph_def_pb2.DataTypePb.INT_LIST
elif t == "long_list" or t.startswith("fixedlistlong"):
return graph_def_pb2.DataTypePb.LONG_LIST
elif t == "float_list" or t.startswith("fixedlistfloat"):
return graph_def_pb2.DataTypePb.FLOAT_LIST
elif t == "double_list" or t.startswith("fixedlistdouble"):
return graph_def_pb2.DataTypePb.DOUBLE_LIST
elif t in ("empty", "grape::emptytype"):
return graph_def_pb2.NULLVALUE
raise TypeError("Not supported type {}".format(t))
def unify_type(t):
# If type is None, we deduce type from source file.
if t is None:
return graph_def_pb2.DataTypePb.UNKNOWN
if isinstance(t, str):
return _unify_str_type(t)
elif isinstance(t, type):
unify_types = {
int: graph_def_pb2.LONG,
np.int32: graph_def_pb2.INT,
np.int64: graph_def_pb2.LONG,
np.uint32: graph_def_pb2.UINT,
np.uint64: graph_def_pb2.ULONG,
float: graph_def_pb2.DOUBLE,
np.float32: graph_def_pb2.FLOAT,
np.float64: graph_def_pb2.DOUBLE,
str: graph_def_pb2.STRING,
np.str_: graph_def_pb2.STRING,
bool: graph_def_pb2.BOOL,
            np.bool_: graph_def_pb2.BOOL,
list: graph_def_pb2.INT_LIST,
tuple: graph_def_pb2.INT_LIST,
dict: graph_def_pb2.DYNAMIC,
}
return unify_types[t]
elif isinstance(t, int): # graph_def_pb2.DataType
return t
raise TypeError("Not supported type {}".format(t))
def data_type_to_cpp(t):
if t == graph_def_pb2.INT:
return "int32_t"
elif t == graph_def_pb2.LONG:
return "int64_t"
elif t == graph_def_pb2.UINT:
return "uint32_t"
elif t == graph_def_pb2.ULONG:
return "uint64_t"
elif t == graph_def_pb2.FLOAT:
return "float"
elif t == graph_def_pb2.DOUBLE:
return "double"
elif t == graph_def_pb2.STRING:
return "std::string"
elif t is None or t == graph_def_pb2.NULLVALUE:
return "grape::EmptyType"
elif t == graph_def_pb2.DYNAMIC:
return "dynamic::Value"
elif t == graph_def_pb2.UNKNOWN:
return ""
raise ValueError("Not support type {}".format(t))
def data_type_to_python(t):
if t in (
graph_def_pb2.INT,
graph_def_pb2.LONG,
graph_def_pb2.UINT,
graph_def_pb2.ULONG,
):
return int
elif t in (graph_def_pb2.FLOAT, graph_def_pb2.DOUBLE):
return float
elif t == graph_def_pb2.STRING:
return str
elif t in (None, graph_def_pb2.NULLVALUE):
return None
raise ValueError("Not support type {}".format(t))
def data_type_to_unified_type(data_type: int):
if data_type == graph_def_pb2.DataTypePb.BOOL:
return {"primitive_type": "DT_BOOL"}
if data_type == graph_def_pb2.DataTypePb.CHAR:
return {"string": {"char": {"fixed_length": 1}}}
if data_type == graph_def_pb2.DataTypePb.INT:
return {"primitive_type": "DT_SIGNED_INT32"}
if data_type == graph_def_pb2.DataTypePb.LONG:
return {"primitive_type": "DT_SIGNED_INT64"}
if data_type == graph_def_pb2.DataTypePb.FLOAT:
return {"primitive_type": "DT_FLOAT"}
if data_type == graph_def_pb2.DataTypePb.DOUBLE:
return {"primitive_type": "DT_DOUBLE"}
if data_type == graph_def_pb2.DataTypePb.STRING:
return {"string": {"long_text": None}}
if data_type == graph_def_pb2.DataTypePb.UINT:
return {"primitive_type": "DT_UNSIGNED_INT32"}
if data_type == graph_def_pb2.DataTypePb.ULONG:
return {"primitive_type": "DT_UNSIGNED_INT64"}
    if data_type == graph_def_pb2.DataTypePb.UNKNOWN:
        return "UNKNOWN"
    raise ValueError("Not support data type {}".format(data_type))
def unified_type_to_data_type(unified_type):
if unified_type == {"primitive_type": "DT_BOOL"}:
return graph_def_pb2.DataTypePb.BOOL
if unified_type == {"string": {"char": {"fixed_length": 1}}}:
return graph_def_pb2.DataTypePb.CHAR
if unified_type == {"primitive_type": "DT_SIGNED_INT32"}:
return graph_def_pb2.DataTypePb.INT
if unified_type == {"primitive_type": "DT_SIGNED_INT64"}:
return graph_def_pb2.DataTypePb.LONG
if unified_type == {"primitive_type": "DT_FLOAT"}:
return graph_def_pb2.DataTypePb.FLOAT
if unified_type == {"primitive_type": "DT_DOUBLE"}:
return graph_def_pb2.DataTypePb.DOUBLE
if unified_type == {"string": {"long_text": None}}:
return graph_def_pb2.DataTypePb.STRING
if unified_type == {"primitive_type": "DT_UNSIGNED_INT32"}:
return graph_def_pb2.DataTypePb.UINT
if unified_type == {"primitive_type": "DT_UNSIGNED_INT64"}:
return graph_def_pb2.DataTypePb.ULONG
if unified_type == "UNKNOWN":
return graph_def_pb2.DataTypePb.UNKNOWN
raise ValueError("Not support unified type {}".format(unified_type))
def normalize_data_type_str(data_type):
data_type = data_type.lower()
if data_type in ("int8", "int8_t"):
return "int8_t"
if data_type in ("int16", "int16_t"):
return "int16_t"
if data_type in ("int", "int32_t", "int32"):
return "int32_t"
elif data_type in ("long", "int64_t", "int64"):
return "int64_t"
elif data_type in ("uint32_t", "uint32"):
return "uint32_t"
elif data_type in ("uint64_t", "uint64"):
return "uint64_t"
elif data_type in ("str", "string", "std::string"):
return "std::string"
else:
return data_type
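
# Example (hedged): normalize the many spellings of a C-like type name.
#
#   normalize_data_type_str("int64")   # -> "int64_t"
#   normalize_data_type_str("string")  # -> "std::string"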
def vertex_map_type_to_enum(vertex_map):
if isinstance(vertex_map, str):
if vertex_map == "global":
vertex_map = graph_def_pb2.GLOBAL_VERTEX_MAP
elif vertex_map == "local":
vertex_map = graph_def_pb2.LOCAL_VERTEX_MAP
else:
raise ValueError("vertex_map can only be global or local.")
elif isinstance(vertex_map, int):
assert vertex_map in (
graph_def_pb2.GLOBAL_VERTEX_MAP,
graph_def_pb2.LOCAL_VERTEX_MAP,
)
return vertex_map
def vertex_map_type_to_cpp(t):
if t == graph_def_pb2.GLOBAL_VERTEX_MAP:
return "vineyard::GlobalVertexMap"
elif t == graph_def_pb2.LOCAL_VERTEX_MAP:
return "vineyard::LocalVertexMap"
else:
raise ValueError("Not support vertex map type {}".format(t))
def transform_vertex_range(vertex_range):
if vertex_range:
return json.dumps(vertex_range)
return None
def _from_numpy_dtype(dtype):
dtype_reverse_map = {
np.dtype(np.int8): types_pb2.INT8,
np.dtype(np.int16): types_pb2.INT16,
np.dtype(np.int32): types_pb2.INT32,
np.dtype(np.int64): types_pb2.INT64,
np.dtype(np.uint8): types_pb2.UINT8,
np.dtype(np.uint16): types_pb2.UINT16,
np.dtype(np.uint32): types_pb2.UINT32,
np.dtype(np.uint64): types_pb2.UINT64,
np.dtype(np.intc): types_pb2.INT,
np.dtype(int): types_pb2.LONG,
np.dtype(bool): types_pb2.BOOLEAN,
np.dtype(float): types_pb2.FLOAT,
np.dtype(np.double): types_pb2.DOUBLE,
np.dtype(object): types_pb2.STRING,
}
pbdtype = dtype_reverse_map.get(dtype)
if pbdtype is None:
raise NotImplementedError("Do not support type {}".format(dtype))
return pbdtype
def _to_numpy_dtype(dtype):
dtype_map = {
types_pb2.INT8: np.int8,
types_pb2.INT16: np.int16,
types_pb2.INT32: np.int32,
types_pb2.INT64: np.int64,
types_pb2.UINT8: np.uint8,
types_pb2.UINT16: np.uint16,
types_pb2.UINT32: np.uint32,
types_pb2.UINT64: np.uint64,
types_pb2.INT: np.intc,
types_pb2.LONG: int,
types_pb2.BOOLEAN: bool,
types_pb2.FLOAT: float,
types_pb2.DOUBLE: np.double,
types_pb2.STRING: object,
}
npdtype = dtype_map.get(dtype)
if npdtype is None:
raise NotImplementedError("Do not support type {}".format(dtype))
return npdtype
def deprecated(msg):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(f"Deprecated usage: {msg}", DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)

        return wrapper

    return decorator
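
# Example (hedged): mark an old helper as deprecated; callers then get a
# DeprecationWarning pointing at their own call site. The function name below
# is made up for illustration.
#
#   @deprecated("use get_free_port() instead")
#   def legacy_get_port():
#       return get_free_port()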
def apply_docstring(fn):
"""Apply the docstring of `fn` to annotated function."""
def decorator(func):
func.__doc__ = fn.__doc__
return func
return decorator
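
# Example (hedged): reuse the docstring of a reference function on a thin
# wrapper, so help() shows the original documentation. The wrapper below is
# made up for illustration.
#
#   @apply_docstring(get_free_port)
#   def get_free_port_cached(host="localhost"):
#       return get_free_port(host)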