in src/python/turicreate/data_structures/sframe.py [0:0]
def __init__(self, data=None, format="auto", _proxy=None):
    """__init__(data=list(), format='auto')
    Construct a new SFrame from in-memory data, a file path / url, or an
    existing backing proxy.

    Parameters
    ----------
    data : optional
        The source of the frame. With ``format='auto'`` the handling is
        inferred from the value, in this order:

        - ``pandas.DataFrame`` (only checked when pandas is available):
          each DataFrame column becomes an SFrame column.
        - ``str``: names ending in ``.csv`` / ``.csv.gz`` are read as
          comma-separated, ``.tsv`` / ``.tsv.gz`` as tab-separated,
          ``.txt`` / ``.txt.gz`` are assumed to be csv (a notice is
          printed); any other string is loaded as a saved SFrame.
        - ``SArray``: becomes a single column with an empty name.
        - ``SFrame``: every column of ``data`` is added to this frame.
        - ``dict``: keys are column names; all values must have the same
          length.
        - any other non-string iterable: either a collection made only
          of SArrays (one column each) or plain values (one column).
        - ``None``: an empty SFrame.

    format : str, optional
        ``'auto'`` (default) to infer from ``data``, or one of
        ``'dataframe'``, ``'sframe_obj'``, ``'sarray'``, ``'array'``,
        ``'dict'``, ``'csv'``, ``'tsv'``, ``'sframe'``, ``'empty'`` to
        force a specific handling.

    _proxy : optional
        Internal. When truthy it is adopted as the backing proxy and
        ``data`` / ``format`` are ignored.

    Raises
    ------
    ValueError
        If the type of ``data`` cannot be inferred, if ``format`` is not
        a recognised value, or if an iterable mixes SArrays with regular
        values.
    RuntimeError
        If the values of a ``dict`` do not all have the same length.
    """
    # Wrapping an existing proxy: nothing to load or infer.
    if _proxy:
        self.__proxy__ = _proxy
        return

    self.__proxy__ = UnitySFrameProxy()

    # Python 2 only: normalise unicode paths to byte strings so the `str`
    # check below is the only string check needed.
    if six.PY2 and isinstance(data, six.text_type):
        data = data.encode("utf-8")

    if format == "auto":
        if HAS_PANDAS and isinstance(data, pandas.DataFrame):
            _format = "dataframe"
        elif isinstance(data, str):
            if data.endswith((".csv", ".csv.gz")):
                _format = "csv"
            elif data.endswith((".tsv", ".tsv.gz")):
                _format = "tsv"
            elif data.endswith((".txt", ".txt.gz")):
                print(
                    "Assuming file is csv. For other delimiters, "
                    + "please use `SFrame.read_csv`."
                )
                _format = "csv"
            else:
                _format = "sframe"
        elif isinstance(data, SArray):
            _format = "sarray"
        elif isinstance(data, SFrame):
            _format = "sframe_obj"
        elif isinstance(data, dict):
            _format = "dict"
        elif _is_non_string_iterable(data):
            _format = "array"
        elif data is None:
            _format = "empty"
        else:
            raise ValueError("Cannot infer input type for data " + str(data))
    else:
        _format = format

    with cython_context():
        if _format == "dataframe":
            for c in data.columns.values:
                self.add_column(SArray(data[c].values), str(c), inplace=True)
        elif _format == "sframe_obj":
            for col in data.column_names():
                self.__proxy__.add_column(data[col].__proxy__, col)
        elif _format == "sarray":
            self.__proxy__.add_column(data.__proxy__, "")
        elif _format == "array":
            # Explicit length test (not truthiness): `data` may be an
            # array-like whose truth value is not a plain bool.
            if len(data) > 0:
                unique_types = {type(x) for x in data if x is not None}
                if unique_types == {SArray}:
                    for arr in data:
                        self.add_column(arr, inplace=True)
                elif SArray in unique_types:
                    raise ValueError(
                        "Cannot create SFrame from mix of regular values and SArrays"
                    )
                else:
                    self.__proxy__.add_column(SArray(data).__proxy__, "")
        elif _format == "dict":
            # Validate that every column is the same length.
            if len({len(value) for value in data.values()}) > 1:
                # Probably should be a ValueError, but RuntimeError is what
                # has historically been raised here, so it is kept.
                raise RuntimeError("All column should be of the same length")
            # Split into SArray values and other iterable values: the
            # iterable values are converted in bulk, then the SArray values
            # are added as columns in sorted key order.
            sarray_keys = sorted(
                key
                for key, value in six.iteritems(data)
                if isinstance(value, SArray)
            )
            self.__proxy__.load_from_dataframe(
                {
                    key: value
                    for key, value in six.iteritems(data)
                    if not isinstance(value, SArray)
                }
            )
            for key in sarray_keys:
                self.__proxy__.add_column(data[key].__proxy__, key)
        elif _format in ("csv", "tsv"):
            delimiter = "," if _format == "csv" else "\t"
            tmpsf = SFrame.read_csv(data, delimiter=delimiter, header=True)
            self.__proxy__ = tmpsf.__proxy__
        elif _format == "sframe":
            url = _make_internal_url(data)
            self.__proxy__.load_from_sframe_index(url)
        elif _format == "empty":
            pass
        else:
            # %-format on a 1-tuple so a non-string `format` still yields
            # the intended ValueError rather than a TypeError from `+`.
            raise ValueError("Unknown input type: %s" % (format,))