#include "stdafx.h"
#include "SparseDataStorage.h"

// Note: See header for an overview of this class

int SparseDataStorage::deleteQueueIndex;
XLockFreeStack <unsigned char> SparseDataStorage::deleteQueue[3];

void SparseDataStorage::staticCtor()
{
	for( int i = 0; i < 3; i++ )
	{
		deleteQueue[i].Initialize();
	}
}

// Initialise data storage, with very limited compression - the very first plane is stored as either compressed to be "all 0", and the rest of the planes aren't compressed at all.
// The reason behind this is to keep the total allocation as a round number of 4K (small) pages, ie 16K.
// By doing this, and doing this "special" allocation as a XPhysicalAlloc rather than a malloc, we can help ensure that this full allocation gets cleaned up properly when the first
// proper compression is done on this storage. If it were just allocated with malloc, then the memory management system would have a large number of 16512 allocations to free, and
// it seems from experimentation that these basically don't make it back to the system as free pages.
// Note - the other approach here would be to allocate *no* actual storage for the data at the ctor stage. However, as chunks are created then this creates an awful lot of intermediate
// stages as each line of data is added, so it is actually much cleaner to just allocate almost fully here & then attempt to do a single compression pass over the data later on.
SparseDataStorage::SparseDataStorage()
{
	// Allocate using physical alloc. As this will (by default) return memory from the pool of 4KB pages, the address will in the range of MM_PHYSICAL_4KB_BASE upwards. We can use
	// this fact to identify the allocation later, and so free it with the corresponding call to XPhysicalFree.
	unsigned char *planeIndices = (unsigned char *)malloc(128 * 128);
	unsigned char *data = planeIndices + 128;
	planeIndices[0] = ALL_0_INDEX;
	for( int i = 1; i < 128; i++ )
	{
		planeIndices[i] = i - 1;
	}
	XMemSet(data, 0, 128 * 127);

	// Data and count packs together the pointer to our data and the count of planes allocated - 127 planes allocated in this case
#pragma warning ( disable : 4826 )
	dataAndCount = 0x007F000000000000L | (( (__int64) planeIndices ) & 0x0000ffffffffffffL);
#pragma warning ( default : 4826 )
#ifdef DATA_COMPRESSION_STATS
	count = 128;
#endif
}

SparseDataStorage::SparseDataStorage(bool isUpper)
{
	// Allocate using physical alloc. As this will (by default) return memory from the pool of 4KB pages, the address will in the range of MM_PHYSICAL_4KB_BASE upwards. We can use
	// this fact to identify the allocation later, and so free it with the corresponding call to XPhysicalFree.
	unsigned char *planeIndices = (unsigned char *)malloc(128);
	for( int i = 0; i < 128; i++ )
	{
		planeIndices[i] = ALL_0_INDEX;
	}

	// Data and count packs together the pointer to our data and the count of planes allocated - 127 planes allocated in this case
#pragma warning ( disable : 4826 )
	dataAndCount = 0x0000000000000000L | (( (__int64) planeIndices ) & 0x0000ffffffffffffL);
#pragma warning ( default : 4826 )
#ifdef DATA_COMPRESSION_STATS
	count = 128;
#endif
}

SparseDataStorage::~SparseDataStorage()
{
	unsigned char *indicesAndData = (unsigned char *)(dataAndCount & 0x0000ffffffffffff);
	// Determine correct means to free this data - could have been allocated either with XPhysicalAlloc or malloc
	
	{
		free(indicesAndData);
	}
//	printf("Free (in dtor) 0x%x\n", indicesAndData);
}

SparseDataStorage::SparseDataStorage(SparseDataStorage *copyFrom)
{
	// Extra details of source storage
	__int64 sourceDataAndCount = copyFrom->dataAndCount;
	unsigned char *sourceIndicesAndData = (unsigned char *)(sourceDataAndCount & 0x0000ffffffffffff);
	int sourceCount = (sourceDataAndCount >> 48 ) & 0xffff;

	// Allocate & copy indices ( 128 bytes ) and any allocated planes (128 * count)
	unsigned char *destIndicesAndData = (unsigned char *)malloc( sourceCount * 128 + 128 );

	// AP - I've moved this to be before the memcpy because of a very strange bug on vita. Sometimes dataAndCount wasn't valid in time when ::get was called.
	// This should never happen and this isn't a proper solution but fixes it for now.
#pragma warning ( disable : 4826 )
	dataAndCount = ( sourceDataAndCount & 0xffff000000000000L ) | ( ((__int64) destIndicesAndData ) & 0x0000ffffffffffffL );
#pragma warning ( default : 4826 )

	XMemCpy( destIndicesAndData, sourceIndicesAndData, sourceCount * 128 + 128 );

#ifdef DATA_COMPRESSION_STATS
	count = sourceCount;
#endif
}

// Set all data values from a data array of length 16384 (128 x 16 x 16 x 0.5). Source data must have same order as original java game
void SparseDataStorage::setData(byteArray dataIn, unsigned int inOffset)
{
	//  Original order is defined as:
	//  pos = (x << 11 | z << 7 | y);
	//  slot = pos >> 1;
	//  part = pos & 1;
	//  if ( part == 0 ) value = data[slot] & 0xf
	//  else value = (data[slot] >> 4) & 0xf

	// Two passed through the data. First pass sets up plane indices, and counts number of planes that we actually need to allocate
	int allocatedPlaneCount = 0;
	unsigned char _planeIndices[128];

	//unsigned char *lastDataPointer = (unsigned char *)(dataAndCount & 0x0000ffffffffffff);

	for( int y = 0; y < 128; y++ )
	{
		bool all0 = true;
			
		for( int xz = 0; xz < 256; xz++ )	// 256 in loop as 16 x 16 separate bytes need checked
		{
			int pos = ( xz << 7 ) | y;
			int slot = pos >> 1;
			int part = pos & 1;
			unsigned char value = ( dataIn[slot + inOffset] >> (part * 4) ) & 15;
			if( value != 0 ) all0 = false;
		}
		if( all0 )
		{
			_planeIndices[y] = ALL_0_INDEX;
		}
		else
		{
			_planeIndices[y] = allocatedPlaneCount++;
		}
	}

	// Allocate required storage
	unsigned char *planeIndices = (unsigned char *)malloc(128 * allocatedPlaneCount + 128);
	unsigned char *data = planeIndices + 128;
	XMemCpy(planeIndices, _planeIndices, 128);

	// Second pass through to actually copy the data in to the storage allocated for the required planes
	unsigned char *pucOut = data;
	for( int y = 0; y < 128 ; y++ )
	{
		// Index will be < 128 if we allocated storage for it and it has a valid index. No need to actually check the index as
		// we know they were sequentially allocated above.
		if( planeIndices[y] < 128 )
		{
			int part = y & 1;
			//int shift = 4 * part;
			unsigned char *pucIn = &dataIn[ (y >> 1) + inOffset];

			for( int xz = 0; xz < 128; xz++ )	// 128 ( 16 x 16 x 0.5 ) in loop as packing 2 values into each destination byte
			{
				*pucOut = ( ( *pucIn ) >> ( part * 4 ) ) & 15;
				pucIn += 64;

				*pucOut |= ( ( ( *pucIn ) >> ( part * 4 ) ) & 15 ) << 4;
				pucIn += 64;
				pucOut++;
			}
		}
	}

	// Get new data and count packed info
#pragma warning ( disable : 4826 )
	__int64 newDataAndCount = ((__int64) planeIndices) & 0x0000ffffffffffffL;
#pragma warning ( default : 4826 )
	newDataAndCount |= ((__int64)allocatedPlaneCount) << 48;

	updateDataAndCount( newDataAndCount );
}

// Gets all data values into an array of length 16384. Destination data will have same order as original java game.
void SparseDataStorage::getData(byteArray retArray, unsigned int retOffset)
{
	XMemSet(retArray.data +  + retOffset, 0, 16384);
	unsigned char *planeIndices, *data;
	getPlaneIndicesAndData(&planeIndices, &data);

	//  Original order is defined as:
	//  pos = (x << 11 | z << 7 | y);
	//  slot = pos >> 1;
	//  part = pos & 1;
	//  if ( part == 0 ) value = data[slot] & 0xf
	//  else value = (data[slot] >> 4) & 0xf

	for( int y = 0; y < 128; y++ )
	{
		if( planeIndices[y] == ALL_0_INDEX )
		{
			// No need to do anything in this case as retArray is initialised to zero
		}
		else
		{
			int part = y & 1;
			int shift = 4 * part;
			unsigned char *pucOut = &retArray.data[ (y >> 1) +  + retOffset];
			unsigned char *pucIn = &data[ planeIndices[ y ] * 128 ];
			for( int xz = 0; xz < 128; xz++ )		// 128 in loop (16 x 16 x 0.5) as input data is being treated in pairs of nybbles that are packed in the same byte
			{
				unsigned char value = (*pucIn) & 15;
				*pucOut |= ( value << shift );
				pucOut += 64;

				value = ((*pucIn) >> 4 ) & 15;
				*pucOut |= ( value << shift );
				pucOut += 64;

				pucIn++;
			}
		}
	}
}

// Get an individual data value
int SparseDataStorage::get(int x, int y, int z)
{
	unsigned char *planeIndices, *data;
	getPlaneIndicesAndData(&planeIndices, &data);
	
	if( planeIndices[y] == ALL_0_INDEX )
	{
		return 0;
	}
	else
	{
		int planeIndex = x * 16 + z;				// Index within this xz plane
		int byteIndex = planeIndex / 2;				// Byte index within the plane (2 tiles stored per byte)
		int shift = ( planeIndex & 1 ) * 4;			// Bit shift within the byte
		int retval = ( data[ planeIndices[y] * 128 + byteIndex ] >> shift ) & 15;

		return retval;
	}
}

// Set an individual data value
void SparseDataStorage::set(int x, int y, int z, int val)
{
	unsigned char *planeIndices, *data;
	getPlaneIndicesAndData(&planeIndices, &data);

	// If this plane isn't yet allocated, then we might have some extra work to do
	if( planeIndices[y] >= ALL_0_INDEX )
	{
		// No data allocated. Early out though if we are storing what is already represented by our special index.
		if( ( val == 0 ) && ( planeIndices[y] == ALL_0_INDEX ) )
		{
			return;
		}

		// Reallocate the storage for planes to accomodate one extra
		addNewPlane(y);

		// Get pointers again as these may have moved
		getPlaneIndicesAndData(&planeIndices, &data);
	}

	// Either data was already allocated, or we've just done that. Now store our value into the right place.

	int planeIndex = x * 16 + z;				// Index within this xz plane
	int byteIndex = planeIndex / 2;				// Byte index within the plane (2 tiles stored per byte)
	int shift = ( planeIndex & 1 ) * 4;			// Bit shift within the byte
	int mask = 0xf0 >> shift;

	int idx = planeIndices[y] * 128 + byteIndex;
	data[idx] = ( data[idx] & mask ) | ( val << shift );

}

// Sets a region of data values with the data at offset position in the array dataIn - external ordering compatible with java DataLayer
// Note - when data was extracted from the original data layers by LevelChunk::getBlocksAndData, y0 had to have even alignment and y1 - y0 also
// needed to be even as data was packed in nyblles in this dimension, and the code didn't make any attempt to unpack it. This behaviour is copied
// here for compatibility even though our source data isn't packed this way.
// Returns size of data copied.
int SparseDataStorage::setDataRegion(byteArray dataIn, int x0, int y0, int z0, int x1, int y1, int z1, int offset, tileUpdatedCallback callback, void *param, int yparam)
{
	// Actual setting of data happens when calling set method so no need to lock here
	unsigned char *pucIn = &dataIn.data[offset];
	if( callback )
	{
		for( int x = x0; x < x1; x++ )
		{
			for( int z = z0; z < z1; z++ )
			{
				// Emulate how data was extracted from DataLayer... see comment above
				int yy0 = y0 & 0xfffffffe;
				int len = ( y1 - y0 ) / 2;
				for( int i = 0; i < len; i++ )
				{
					int y = yy0 + ( i * 2 );

					int toSet = (*pucIn) & 15;
					if( get(x, y, z) != toSet )
					{
						set(x, y, z, toSet );
						callback(x, y, z, param, yparam);
					}
					toSet = ((*pucIn) >> 4 ) & 15;
					if( get(x, y + 1, z) != toSet )
					{
						set(x, y + 1, z, toSet );
						callback(x, y + 1, z, param, yparam);
					}
					pucIn++;
				}
			}
		}
	}
	else
	{
		for( int x = x0; x < x1; x++ )
		{
			for( int z = z0; z < z1; z++ )
			{
				// Emulate how data was extracted from DataLayer... see comment above
				int yy0 = y0 & 0xfffffffe;
				int len = ( y1 - y0 ) / 2;
				for( int i = 0; i < len; i++ )
				{
					int y = yy0 + ( i * 2 );

					set(x, y, z, (*pucIn) & 15 );
					set(x, y + 1, z, ((*pucIn) >> 4 ) & 15 );
					pucIn++;
				}
			}
		}
	}
	ptrdiff_t count = pucIn - &dataIn.data[offset];

	return (int)count;
}

// Updates the data at offset position dataInOut with a region of data information - external ordering compatible with java DataLayer
// Note - when data was placed in the original data layers by LevelChunk::setBlocksAndData, y0 had to have even alignment and y1 - y0 also
// needed to be even as data was packed in nyblles in this dimension, and the code didn't make any attempt to unpack it. This behaviour is copied
// here for compatibility even though our source data isn't packed this way
// Returns size of data copied.
int SparseDataStorage::getDataRegion(byteArray dataInOut, int x0, int y0, int z0, int x1, int y1, int z1, int offset)
{
	unsigned char *pucOut = &dataInOut.data[offset];
	for( int x = x0; x < x1; x++ )
	{
		for( int z = z0; z < z1; z++ )
		{
			// Emulate how data was extracted from DataLayer... see comment above
			int yy0 = y0 & 0xfffffffe;
			int len = ( y1 - y0 ) / 2;
			for( int i = 0; i < len; i++ )
			{
				int y = yy0 + ( i * 2 );

				*pucOut = get( x, y, z);
				*pucOut |= get( x, y + 1, z) << 4;
				pucOut++;
			}
		}
	}
	ptrdiff_t count = pucOut - &dataInOut.data[offset];
	
	return (int)count;
}

void SparseDataStorage::addNewPlane(int y)
{
	bool success = false;
	do
	{
		// Get last packed data pointer & count
		__int64 lastDataAndCount = dataAndCount;

		// Unpack count & data pointer
		int lastLinesUsed = (int)(( lastDataAndCount >> 48 ) & 0xffff);
		unsigned char *lastDataPointer = (unsigned char *)(lastDataAndCount & 0x0000ffffffffffff);

		// Find out what to prefill the newly allocated line with
		unsigned char planeIndex = lastDataPointer[y];
		
		if( planeIndex < ALL_0_INDEX ) return;			// Something has already allocated this line - we're done
		
		int linesUsed = lastLinesUsed + 1;

		// Allocate new memory storage, copy over anything from old storage, and initialise remainder
		unsigned char *dataPointer = (unsigned char *)malloc(linesUsed * 128 + 128);
		XMemCpy( dataPointer, lastDataPointer, 128 * lastLinesUsed + 128);
		XMemSet( dataPointer + ( 128 * lastLinesUsed ) + 128, 0, 128 );
		dataPointer[y] = lastLinesUsed;

		// Get new data and count packed info
#pragma warning ( disable : 4826 )
		__int64 newDataAndCount = ((__int64) dataPointer) & 0x0000ffffffffffffL;
#pragma warning ( default : 4826 )
		newDataAndCount |= ((__int64)linesUsed) << 48;

		// Attempt to update the data & count atomically. This command will Only succeed if the data stored at
		// dataAndCount is equal to lastDataAndCount, and will return the value present just before the write took place
		__int64 lastDataAndCount2 = InterlockedCompareExchangeRelease64( &dataAndCount, newDataAndCount, lastDataAndCount );
		
		if( lastDataAndCount2 == lastDataAndCount )
		{
			success = true;
			// Queue old data to be deleted
			queueForDelete( lastDataPointer );
//			printf("Marking for delete 0x%x\n", lastDataPointer);
#ifdef DATA_COMPRESSION_STATS
			count = linesUsed;
#endif
		}
		else
		{
			// If we didn't succeed, queue data that we made to be deleted, and try again
			queueForDelete( dataPointer );
//			printf("Marking for delete (fail) 0x%x\n", dataPointer);
		}
	} while( !success );
}

void SparseDataStorage::getPlaneIndicesAndData(unsigned char **planeIndices, unsigned char **data)
{
	unsigned char *indicesAndData = (unsigned char *)(dataAndCount & 0x0000ffffffffffff);

	*planeIndices = indicesAndData;
	*data = indicesAndData + 128;

}

void SparseDataStorage::queueForDelete(unsigned char *data)
{
	// Add this into a queue for deleting. This shouldn't be actually deleted until tick has been called twice from when
	// the data went into the queue.
	deleteQueue[deleteQueueIndex].Push( data );
}

void SparseDataStorage::tick()
{
	// We have 3 queues for deleting. Always delete from the next one after where we are writing to, so it should take 2 ticks
	// before we ever delete something, from when the request to delete it came in
	int freeIndex = ( deleteQueueIndex + 1 ) % 3;

//	printf("Free queue: %d, %d\n",deleteQueue[freeIndex].GetEntryCount(),deleteQueue[freeIndex].GetAllocated());
	unsigned char *toFree = NULL;
	do
	{
		toFree = deleteQueue[freeIndex].Pop();
//		if( toFree ) printf("Deleting 0x%x\n", toFree);
		// Determine correct means to free this data - could have been allocated either with XPhysicalAlloc or malloc
		{
			free(toFree);
		}
	} while( toFree );

	deleteQueueIndex = ( deleteQueueIndex + 1 ) % 3;
}

// Update storage with a new values for dataAndCount, repeating as necessary if other simultaneous writes happen.
void SparseDataStorage::updateDataAndCount(__int64 newDataAndCount)
{
	// Now actually assign this data to the storage. Just repeat until successful, there isn't any useful really that we can merge the results of this
	// with any other simultaneous writes that might be happening.
	bool success = false;
	do
	{
		__int64 lastDataAndCount = dataAndCount;
		unsigned char *lastDataPointer = (unsigned char *)(lastDataAndCount & 0x0000ffffffffffff);

		// Attempt to update the data & count atomically. This command will Only succeed if the data stored at
		// dataAndCount is equal to lastDataAndCount, and will return the value present just before the write took place
		__int64 lastDataAndCount2 = InterlockedCompareExchangeRelease64( &dataAndCount, newDataAndCount, lastDataAndCount );
		
		if( lastDataAndCount2 == lastDataAndCount )
		{
			success = true;
			// Queue old data to be deleted
//			printf("Marking for delete 0x%x (full replace)\n", lastDataPointer);
			queueForDelete( lastDataPointer );
		}
	} while( !success);

#ifdef DATA_COMPRESSION_STATS
	count = ( newDataAndCount >> 48 ) & 0xffff;
#endif

}

// Attempt to compress the stored data. This method makes no guarantee of success - if it fails due to something else writing to the storage whilst this is running, then it won't actually do anything.
int SparseDataStorage::compress()
{
	unsigned char _planeIndices[128];
	bool needsCompressed = false;

	__int64 lastDataAndCount = dataAndCount;

	unsigned char *planeIndices = (unsigned char *)(lastDataAndCount & 0x0000ffffffffffff);
	unsigned char *data = planeIndices + 128;

	int planesToAlloc = 0;
	for( int i = 0; i < 128; i++ )
	{
		if( planeIndices[i] == ALL_0_INDEX )
		{
			_planeIndices[i] = ALL_0_INDEX;
		}
		else
		{
			unsigned char *pucData = &data[ 128 * planeIndices[i] ];
			bool all0 = true;
			for( int j = 0; j < 128; j++ )	// 16 x 16 x 4-bits
			{
				if( *pucData != 0 ) all0 = false;
				pucData++;
			}
			if( all0 )
			{
				_planeIndices[i] = ALL_0_INDEX;
				needsCompressed = true;
			}
			else
			{
				_planeIndices[i] = planesToAlloc++;
			}
		}
	}

	if( needsCompressed )
	{
		unsigned char *newIndicesAndData = (unsigned char *)malloc( 128 + 128 * planesToAlloc );
		unsigned char *pucData = newIndicesAndData + 128;
		XMemCpy( newIndicesAndData, _planeIndices, 128 );

		for( int i = 0; i < 128; i++ )
		{
			if( newIndicesAndData[i] < ALL_0_INDEX )
			{
				XMemCpy( pucData, &data[ 128 * planeIndices[i] ], 128 );
				pucData += 128;
			}
		}

		// Get new data and count packed info
#pragma warning ( disable : 4826 )
		__int64 newDataAndCount = ((__int64) newIndicesAndData) & 0x0000ffffffffffffL;
#pragma warning ( default : 4826 )
		newDataAndCount |= ((__int64)planesToAlloc) << 48;

		// Attempt to update the data & count atomically. This command will Only succeed if the data stored at
		// dataAndCount is equal to lastDataAndCount, and will return the value present just before the write took place
		__int64 lastDataAndCount2 = InterlockedCompareExchangeRelease64( &dataAndCount, newDataAndCount, lastDataAndCount );

		if( lastDataAndCount2 != lastDataAndCount )
		{
			// Failed to write. Don't bother trying again... being very conservative here.
//			printf("Marking for delete 0x%x (compress fail)\n", newIndicesAndData);
			queueForDelete( newIndicesAndData );
		}
		else
		{
			// Success
			queueForDelete( planeIndices );
//			printf("Successfully compressed to %d planes, to delete 0x%x\n", planesToAlloc, planeIndices); 
#ifdef DATA_COMPRESSION_STATS
			count = planesToAlloc;
#endif
		}

		return planesToAlloc;
	}
	else
	{
		return (int)((lastDataAndCount >> 48 ) & 0xffff);
	}
}


bool SparseDataStorage::isCompressed()
{

	int	count = ( dataAndCount >> 48 ) & 0xffff;
	return (count < 127);


}

void SparseDataStorage::write(DataOutputStream *dos)
{
	int count = ( dataAndCount >> 48 ) & 0xffff;
	dos->writeInt(count);
	unsigned char *dataPointer = (unsigned char *)(dataAndCount & 0x0000ffffffffffff);
	byteArray wrapper(dataPointer, count * 128 + 128);
	dos->write(wrapper);
}

void SparseDataStorage::read(DataInputStream *dis)
{
	int count = dis->readInt();
	unsigned char *dataPointer = (unsigned char *)malloc(count * 128 + 128);
	byteArray wrapper(dataPointer, count * 128 + 128);
	dis->readFully(wrapper);

#pragma warning ( disable : 4826 )
	__int64 newDataAndCount = ((__int64) dataPointer) & 0x0000ffffffffffffL;
#pragma warning ( default : 4826 )
	newDataAndCount |= ((__int64)count) << 48;

	updateDataAndCount(newDataAndCount);
}
