void PythonOutputDataSet::RetrieveStringColumnFromDataFrame()

in language-extensions/python/src/PythonDataSet.cpp [1093:1235]


void PythonOutputDataSet::RetrieveStringColumnFromDataFrame(
	string      columnName,
	SQLULEN     &columnSize,
	SQLSMALLINT &decimalDigits,
	SQLSMALLINT &nullable)
{
	LOG("PythonOutputDataSet::RetrieveStringColumnFromDataFrame");

	vector<char> columnData;
	SQLINTEGER *strLenOrNullMap = nullptr;
	if (m_rowsNumber > 0)
	{
		strLenOrNullMap = new SQLINTEGER[m_rowsNumber];
	}

	decimalDigits = 0;
	nullable = SQL_NO_NULLS;

	np::ndarray column = ExtractArrayFromDataFrame(columnName);
	string dType = bp::extract<string>(bp::str(column.get_dtype()));

	// Store the data in a vector (columnData) while we calculate the total size we need 
	// for the contiguous char data.
	//
	SQLINTEGER maxLen = sizeof(char);
	int fullSize = 0;
	for (SQLULEN row = 0; row < m_rowsNumber; ++row)
	{
		string stringToAdd = "";
		bp::object pyObj = column[row];
		bool null = true;

		if (!pyObj.is_none())
		{
			null = false;

			// If we have a bytes object we want to take only the bytes, not the b'' around them
			//
			if (PyBytes_Check(pyObj.ptr()))
			{
				// Extract the size and bytes of the pyObj
				//
				PyObject *baseObj = pyObj.ptr();

				int size = PyBytes_Size(baseObj);
				char *bytes = static_cast<char *>(
					static_cast<void *>(PyBytes_AsString(baseObj)));

				stringToAdd = string(bytes, size);
			}
			else
			{
				// Extract a utf-8 encoded version of the string
				//
				stringToAdd = bp::extract<string>(bp::str(pyObj).encode("utf-8"));
				
				// If we are dealing with date types in a streaming case, 
				// we have to clean up the datetime string so that SQL can convert properly
				//
				if (m_isStreaming && regex_match(dType, regex(".*date.*")))
				{
					// NaT counts as None / NULL for date types
					//
					if (stringToAdd.compare("NaT") == 0)
					{
						null = true;
					}
					else 
					{
						// Python Datetimes will look like this: YYYY-MM-DDThh:mm:ss.fffffffff
						// We trim trailing 0s in the fractional part of the timestamp 
						// (and the "." if the fraction is all 0) so SQL can auto-convert 
						// from string to the date type. 
						// Lower precision date types cannot handle trailing zeroes.
						//
						smatch matches;
						string trimmedDate;

						if (regex_search(stringToAdd, matches, regex("((.*)([.].*?))(0+)$"))) 
						{
							// "matches" will have this by index:
							// 0. YYYY-MM-DDThh:mm:ss.fffff0000
							// 1. YYYY-MM-DDThh:mm:ss.fffff
							// 2. YYYY-MM-DDThh:mm:ss
							// 3. .fffff
							// If there is no fractional second then #3 is just ".", and #1 will
							// have the trailing "." which we don't want, so we use #2 instead.
							// 
							trimmedDate = matches[1].str();
							if (matches[3].str() == ".") 
							{
								trimmedDate = matches[2].str();
							}

							stringToAdd = trimmedDate;
						}
					}
				}
			}
		}

		if (!null)
		{
			fullSize += stringToAdd.size();
			strLenOrNullMap[row] = stringToAdd.size();
			columnData.insert(columnData.end(), stringToAdd.begin(), stringToAdd.end());
		}
		else
		{
			strLenOrNullMap[row] = SQL_NULL_DATA;
			nullable = SQL_NULLABLE;
		}

		// Store the maximum length to find the widest the column needs to be
		//
		if (maxLen < strLenOrNullMap[row])
		{
			maxLen = strLenOrNullMap[row];
		}
	}

	// Create a single block of memory that will hold all the data contiguously.
	//
	unique_ptr<char[]> dataPtr(new char[fullSize]);

	// Copy our data from the vector of strings to the single chunk of memory.
	//
	memcpy(dataPtr.get(), columnData.data(), fullSize);

	columnSize = maxLen;
	if (m_rowsNumber > 0)
	{
		// Add the memory block to the m_data and give the memory handling job to m_data.
		//
		m_data.push_back(static_cast<SQLPOINTER>(dataPtr.release()));
	}
	else
	{
		m_data.push_back(nullptr);
	}

	m_columnNullMap.push_back(strLenOrNullMap);
}