This commit is contained in:
nephacks
2025-06-04 03:22:50 +02:00
parent f234f23848
commit f12416cffd
14243 changed files with 6446499 additions and 26 deletions

View File

@@ -0,0 +1,76 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// SPU Profiling
//
// Emits SN Tuner "bookmarks" from SPU code so that profile ranges and single
// markers show up on the Tuner timeline.
//==================================================================================================
#ifndef INCLUDED_CELLMGR_SPU_PROFILE_H
#define INCLUDED_CELLMGR_SPU_PROFILE_H
//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------
#include <stdint.h>
//--------------------------------------------------------------------------------------------------
// Defines
//--------------------------------------------------------------------------------------------------
// Uncomment to enable profiling
//#define ENABLE_SPU_PROFILE
//--------------------------------------------------------------------------------------------------
// Constants
//--------------------------------------------------------------------------------------------------
// Number of bookmarks emitted per start/stop event (see raw_spu_prof_start/stop)
const int NUM_BOOKMARKS_IN_EVENT = 6;
//--------------------------------------------------------------------------------------------------
// Functions
//--------------------------------------------------------------------------------------------------
/*
 * Insert a marker that is displayed in Tuner
 */
void insert_bookmark( uint32_t bookmark );
/*
 * 400 cycles delay per bookmark when emitting bookmarks on multiple SPUs
 */
void bookmark_delay( int NumBookmarks );
/*
 * Inserting 6 SPU bookmarks, which will
 * be identified by Tuner as a start event
 */
void raw_spu_prof_start( int iLevel, uint16_t lsa );
/*
 * Inserting 6 SPU bookmarks, which will
 * be identified by Tuner as a stop event
 */
void raw_spu_prof_stop( uint16_t lsa );
/*
 * Profiling macros.  They compile away to nothing unless ENABLE_SPU_PROFILE
 * is defined.  NOTE(review): END_BOOKMARK expands to nothing even when
 * profiling is enabled -- presumably bookmarks are point markers with no
 * explicit end; confirm against Tuner usage.
 */
#ifdef ENABLE_SPU_PROFILE
#define BEGIN_PROFILE(level) raw_spu_prof_start(level, 0)
#define END_PROFILE(level) raw_spu_prof_stop(level)
#define BEGIN_BOOKMARK(colour) insert_bookmark( colour )
#define END_BOOKMARK(colour)
#else
#define BEGIN_PROFILE(level)
#define END_PROFILE(level)
#define BEGIN_BOOKMARK(colour)
#define END_BOOKMARK(colour)
#endif
#endif // INCLUDED_CELLMGR_SPU_PROFILE_H

View File

@@ -0,0 +1,76 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
//
//
//==================================================================================================
//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------
#include "Profile.h"
#include <spu_intrinsics.h>
//--------------------------------------------------------------------------------------------------
// Functions
//--------------------------------------------------------------------------------------------------
/*
* Insert a marker that is displayed in Tuner
*/
void insert_bookmark( uint32_t bookmark )
{
	// Write the bookmark value to SPU channel 69 (the Tuner bookmark channel).
	// Only the low 16 bits are meaningful to Tuner -- callers pass 16-bit
	// payloads (see raw_spu_prof_start/stop).
	__asm__ volatile ("wrch $69, %0" :: "r" (bookmark));
	// Must wait for 16 cycles before the next channel write or the bookmark
	// may be lost; two 8-nop bundles provide the required spacing.
	__asm__ volatile ("nop;nop;nop;nop;nop;nop;nop;nop");
	__asm__ volatile ("nop;nop;nop;nop;nop;nop;nop;nop");
}
void bookmark_delay( int NumBookmarks )
{
	// Busy-wait roughly 400 cycles per emitted bookmark so that bookmark
	// streams from multiple SPUs do not collide in the trace.  Each loop
	// iteration issues 8 nops; do not restructure -- the delay is the point.
	// 400 cycles per bookmark when emitting bookmarks on both SPUs
	for ( int i=0; i<NumBookmarks*400/8; i++)
	{
		__asm__ volatile ("nop;nop;nop;nop;nop;nop;nop;nop");
	}
}
/*
 * Emit the 6-bookmark sequence that Tuner recognizes as a profile START
 * event: sentinel 0xffaa, 2x16-bit module name, level, LSA, sentinel 0xffab.
 */
void raw_spu_prof_start( int iLevel, uint16_t lsa )
{
	// Bookmark payloads are 16 bits wide, so the 4-character module name is
	// viewed as two uint16s through this union.
	typedef union { char c4[4]; uint16_t u16[2]; uint32_t u32; } Module_u;
	static Module_u s_mu = { { 't', 'e', 's', 't' } };	// hard-coded module name "test"
	insert_bookmark( 0xffaa );			// start marker 1
	insert_bookmark( s_mu.u16[0] );		// name
	insert_bookmark( s_mu.u16[1] );		// name
	insert_bookmark( iLevel );			// level
	insert_bookmark( lsa >> 2 );		// LSA is shifted by 2 as per the SPURS spec.
	insert_bookmark( 0xffab );			// start marker 2
	// Space out this event from bookmarks emitted by other SPUs.
	bookmark_delay( NUM_BOOKMARKS_IN_EVENT );
}
/*
 * Emit the 6-bookmark sequence that Tuner recognizes as a profile STOP
 * event: sentinel 0xffac, 4x16-bit GUID, sentinel 0xffad.
 */
void raw_spu_prof_stop( uint16_t lsa )
{
	typedef union { uint16_t u16[4]; uint64_t u64; } GUID_u;
	GUID_u guid;
	// Build a 64-bit GUID from the quadword at local store address
	// 0x80 + lsa: rotate each word left by 7 bits, then gather bytes
	// {0,1,4,5,8,9,12,13} (the top halfword of each rotated word).
	// NOTE(review): presumably this extracts an identifier embedded in the
	// SPU binary for Tuner to match against -- confirm with the Tuner docs.
	qword insn = si_roti(*(qword*)(0x80 + lsa), 7);
	qword pattern = (qword)(vec_uchar16){0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13};
	guid.u64 = si_to_ullong(si_shufb(insn, insn, pattern));
	insert_bookmark( 0xffac );			// stop marker 1
	insert_bookmark( guid.u16[0] );		// guid
	insert_bookmark( guid.u16[1] );		// guid
	insert_bookmark( guid.u16[2] );		// guid
	insert_bookmark( guid.u16[3] );		// guid
	insert_bookmark( 0xffad );			// stop marker 2
	bookmark_delay( NUM_BOOKMARKS_IN_EVENT );
}

View File

@@ -0,0 +1,98 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
//
//
//==================================================================================================
#ifndef INCLUDED_SPUMGR_DMA_H
#define INCLUDED_SPUMGR_DMA_H
//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------
#include <stdint.h>
#ifdef SPU
//#include <Stdshader_spu/Inc/debug_spu.h> // MH
#else
#include <debug/inc/debug.h>
#endif
//--------------------------------------------------------------------------------------------------
// Defines
//--------------------------------------------------------------------------------------------------
#define SPUMGR_IS_ALIGNED(val, align) (((val) & ((align) - 1)) == 0)
#define SPUMGR_ALIGN_UP(val, align) (((val) + ((align)-1)) & ~((align) - 1))
#define SPUMGR_ALIGN_DOWN(val, align) ((val) & ~((align) - 1))
#define SPUMGR_MSG_MEMCPY 0x000000ff
#define Assert(val) // MH
//--------------------------------------------------------------------------------------------------
// Types
//--------------------------------------------------------------------------------------------------
// Command block handed from the PPU to the SPU for an SPU-driven memcpy
// (consumed by SPU_memcpy; delivered by address via the inbound mailbox).
struct MemCpyHeader
{
	uint32_t src;			// main-memory source effective address
	uint32_t dst;			// main-memory destination effective address
	uint32_t size;			// number of bytes to copy
	uint32_t blocking;		// NOTE(review): presumably non-zero means the PPU waits for completion -- confirm with PPU-side caller
	uint8_t cacheLine[16];	// copy of the 16 bytes preceding dst's aligned start, used by SPU_memcpy to stitch unaligned head bytes
};
//--------------------------------------------------------------------------------------------------
// Classes
//--------------------------------------------------------------------------------------------------
// One element of an SPU DMA list (64 bits total).  Filled in by
// SetupDmaListEntry; each element moves at most 16K (size fits in 15 bits).
struct DMAList
{
	uint32_t stallAndNotify :1;		// stall-and-notify flag for this element
	uint32_t reserved :16;			// reserved bits
	uint32_t size :15;				// transfer size in bytes (<= 0x4000)
	uint32_t ea;					// effective address (low 32 bits) of the transfer
};
//--------------------------------------------------------------------------------------------------
// DmaCheckAlignment
// Checks restrictions specified in SpuMgr::DmaGet
//--------------------------------------------------------------------------------------------------
int DmaCheckAlignment(uint32_t src, uint32_t dest, uint32_t size);
//--------------------------------------------------------------------------------------------------
//SetupDmaListEntry
//
// Note that this function increments input ptr by number of entries added,
// which will be > 1 if size > 16K
//--------------------------------------------------------------------------------------------------
inline void SetupDmaListEntry(uint32_t stall, uint32_t ea, uint32_t size, DMAList **pDmaList)
{
// check alignment; don't pass in NULL for dest
if (!DmaCheckAlignment(ea, 0x10, size))
{
Assert(0);
}
Assert((size & 0xF) == 0); // for lists input sizes must be multiple of 16 bytes
while (size)
{
uint32_t dmaSize = 0x4000;
dmaSize = size < dmaSize? size: dmaSize;
(*pDmaList)->stallAndNotify = stall;
(*pDmaList)->size = dmaSize;
(*pDmaList)->ea = ea;
size -= dmaSize;
ea += dmaSize;
(*pDmaList)++;
}
}
#endif // INCLUDED_SPUMGR_DMA_H

View File

@@ -0,0 +1,628 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
//
//
//==================================================================================================
//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------
#include "sys/memory.h"
#include "sysutil/sysutil_sysparam.h"
#include "cell/sysmodule.h"
#include "tier0/platform.h"
#include "tier0/dbg.h"
#include "tier1/utlbuffer.h"
#include <sys/timer.h>
#include <sys/spu_image.h>
#include <stdio.h>
#include <stdlib.h>
#include <cell/cell_fs.h>
#include <cell/atomic.h>
#include <string.h>
#include "ps3_pathinfo.h"
#include <cell/spurs/control.h>
#include "SpuMgr_ppu.h"
#include "memdbgon.h"
typedef uint32_t uint32;
#define ASSERT Assert
//--------------------------------------------------------------------------------------------------
// Defines
//--------------------------------------------------------------------------------------------------
// Spu Mailbox Status Register
// Described in CBE architecture chapter 8.6.3 SPU Mailbox Status Register (SPU_Mbox_Stat)
#define SPU_IN_MBOX_COUNT_SHIFT (8)
#define SPU_IN_MBOX_COUNT (0xFF << SPU_IN_MBOX_COUNT_SHIFT)
#define SPU_OUT_MBOX_COUNT (0xFF)
#define SPU_OUT_INTR_MBOX_COUNT_SHIFT (16)
#define SPU_OUT_INTR_MBOX_COUNT (0xFF << SPU_OUT_INTR_MBOX_COUNT_SHIFT)
//--------------------------------------------------------------------------------------------------
// Globals
//--------------------------------------------------------------------------------------------------
// SPU manager instance
SpuMgr gSpuMgr;
//--------------------------------------------------------------------------------------------------
// DmaCheckAlignment
// Checks restrictions specified in SpuMgr::DmaGet
//--------------------------------------------------------------------------------------------------
int DmaCheckAlignment(uint32_t src, uint32_t dest, uint32_t size)
{
#if !defined( _CERT )
	// Validate a DMA request against the MFC rules:
	//   - sizes >= 16 must be multiples of 16, with both addresses
	//     16-byte aligned;
	//   - sizes 1/2/4/8 must be naturally aligned and have src and dest
	//     at the same offset within a quadword;
	//   - neither address may be NULL.
	// Returns non-zero when the transfer is acceptable.
	bool ok = (src != 0) && (dest != 0);
	uint32_t align;
	if ((size >= 16) && ((size & 0xf) == 0))
	{
		align = 16;
	}
	else if ((size == 8) || (size == 4) || (size == 2) || (size == 1))
	{
		align = size;
		ok = ok && ((src & 0xF) == (dest & 0xF));
	}
	else
	{
		return 0; // unsupported transfer size
	}
	ok = ok && ((src & (align - 1)) == 0) && ((dest & (align - 1)) == 0);
	return ok;
#else //!_CERT
	return 1; // checks compiled out in CERT builds
#endif //!_CERT
}
//--------------------------------------------------------------------------------------------------
// Internal functions
//--------------------------------------------------------------------------------------------------
//--------------------------------------------------------------------------------------------------
// handle_syscall
//
// interrupt handler to handle SPU interrupts
// see Handle SPU Interrupts Lv2-Uders_manual_e P34
//--------------------------------------------------------------------------------------------------
void handle_syscall (uint64_t arg)
{
	// Class 2 interrupt handler for a raw SPU (arg is the SPU id).
	// Services Stop-and-Signal and Halt interrupts, restarting the SPU for
	// the user-defined stop code 0x3 and notifying the SN debugger hooks
	// otherwise.  See "Handle SPU Interrupts", Lv2 Users Manual, p34.
	sys_raw_spu_t id = arg;
	uint64_t stat;
	int ret;
#ifndef _CERT
	g_snRawSPULockHandler();
#endif
	// Read class 2 interrupt status; SPU halts and stops fall in this
	// category.
	ret = sys_raw_spu_get_int_stat(id, 2, &stat);
	if (ret)
	{
#ifndef _CERT
		g_snRawSPUUnlockHandler();
#endif
		// NOTE(review): sys_interrupt_thread_eoi() presumably does not
		// return (it ends interrupt processing), so the rest of the handler
		// is skipped when the status read fails -- confirm against Lv2 docs.
		sys_interrupt_thread_eoi();
	}
	//
	// SPU Stop-and-Signal Instruction Trap
	// This interrupt occurs when the SPU executes a stop-and-signal
	// instruction.
	//
	if (stat & INTR_STOP_MASK) //stop
	{
		// We've hit a stop; dispatch on the 16-bit stop code.
		uint32_t signalVal = GetStopSignal( id );
		switch ( signalVal )
		{
		case 0x3:
			// A stop placed in the SPU code to signal the PPU.  Do any
			// processing for the user-defined stop here.  If we do not
			// restart the SPU we must call g_snRawSPUNotifySPUStopped(id)
			// so the debugger knows the SPU has stopped.
			// Restart the SPU.
			sys_raw_spu_mmio_write( id, SPU_RunCntl, 0x1 );
			break;
		default:
#ifndef _CERT
			g_snRawSPUNotifySPUStopped(id);
#endif
			break;
		}
	}
	else if (stat & INTR_HALT_MASK) // halt
	{
#ifndef _CERT
		g_snRawSPUNotifySPUStopped(id);
#endif
	}
	// Other class 2 interrupts could be handled here
	// ...
	//
	// Must reset the interrupt status bits of those not handled.
	//
	ret = sys_raw_spu_set_int_stat(id, 2, stat);
	if (ret)
	{
#ifndef _CERT
		g_snRawSPUUnlockHandler();
#endif
		sys_interrupt_thread_eoi();
	}
	//
	// End of interrupt
	//
#ifndef _CERT
	g_snRawSPUUnlockHandler();
#endif
	sys_interrupt_thread_eoi();
}
int CreateDefaultInterruptHandler(SpuTaskHandle *pTask)
{
	// Wire up the default interrupt plumbing for one raw SPU: create an
	// interrupt PPU thread running handle_syscall, create a class 2
	// interrupt tag for the SPU, bind the two together, and unmask the
	// Halt and Stop-and-Signal interrupts.
	// Returns 0 on success, 1 if any step fails.
	if (sys_ppu_thread_create(&pTask->m_ppuThread, handle_syscall,
		0, INTR_HANDLER_THREAD_PRIORITY, INTR_HANDLER_THREAD_STACK_SIZE,
		SYS_PPU_THREAD_CREATE_INTERRUPT, "Interrupt PPU Thread"))
	{
		return 1;
	}
	// Interrupt tag for class 2 interrupts from this SPU.
	if (sys_raw_spu_create_interrupt_tag(pTask->m_spuId, 2, SYS_HW_THREAD_ANY, &pTask->m_intrTag))
	{
		return 1;
	}
	// Associate the tag with the handler thread.
	if (sys_interrupt_thread_establish(&pTask->m_interruptThread, pTask->m_intrTag,
		pTask->m_ppuThread, pTask->m_spuId))
	{
		return 1;
	}
	// Enable Halt and Stop-and-Signal interrupts.
	if (sys_raw_spu_set_int_mask(pTask->m_spuId, 2, INTR_STOP_MASK | INTR_HALT_MASK))
	{
		return 1;
	}
	return 0;
}
//--------------------------------------------------------------------------------------------------
// Class Methods
//--------------------------------------------------------------------------------------------------
int SpuMgr::Init(int numRawSpu)
{
	// Initialize the SPU manager: put numRawSpu SPUs into raw mode and
	// create them, leaving the remainder for SPURS.
	// Returns 0 on success, 1 on failure.
	//
	// Need at least 2 SPUs for SPURS instances
	ASSERT(numRawSpu < 5);
	// Run SPURS on all SPUs that are not in raw mode
	// Creating two SPURS instances. One with a thread group of 5 - numRawSpu threads and one
	// with a thread group of 1 thread.
	// The instance with a single thread is designed to be singled out as the preemption victim
	// when the OS needs to use an SPU. We ensure this by giving it a lower priority than the
	// dedicated SPURS instance.
	// Init dedicated SPUs SPURS instance
	// CellSpursAttribute attr;
	// int32 ret = cellSpursAttributeInitialize(&attr, 5 - numRawSpu, 99, 2, false);
	// ASSERT(ret == CELL_OK);
	// ret = cellSpursAttributeEnableSpuPrintfIfAvailable(&attr);
	// ASSERT(ret == CELL_OK);
	// ret = cellSpursAttributeSetNamePrefix(&attr, "gameSpusSpurs", std::strlen("gameSpusSpurs"));
	// ASSERT(ret == CELL_OK);
	// ret = cellSpursInitializeWithAttribute2(&m_exclusiveSpusSpurs, &attr);
	// ASSERT(ret == CELL_OK);
	// Init pre-emption SPU SPURS instance
	// ret = cellSpursAttributeInitialize(&attr, 1, 100, 2, false);
	// ASSERT(ret == CELL_OK);
	// ret = cellSpursAttributeEnableSpuPrintfIfAvailable(&attr);
	// ASSERT(ret == CELL_OK);
	// ret = cellSpursAttributeSetNamePrefix(&attr, "sharedSpuSpurs", std::strlen("sharedSpuSpurs"));
	// ASSERT(ret == CELL_OK);
	// ret = cellSpursInitializeWithAttribute2(&m_preemptedSpuSpurs, &attr);
	// ASSERT(ret == CELL_OK);
	int res = 0;
	// set up members
	m_numSpus = 0;
	// Initialize SPUs: 6 usable SPUs, numRawSpu of them reserved as raw
	if (sys_spu_initialize(6, numRawSpu) != SUCCEEDED)
	{
		res = 1;
		goto xit;
	}
	// Create raw spus and mark them all as free
	for (; m_numSpus < (uint32)numRawSpu; m_numSpus++)
	{
		if (sys_raw_spu_create(&m_spuIds[m_numSpus], NULL) != SUCCEEDED)
		{
			// BUGFIX: message previously read "saw spu"
			Error("Unable to create raw spu\n");
			res = 1;
			goto xit;
		}
#ifndef _CERT
		g_snRawSPUNotifyCreation(m_spuIds[m_numSpus]);
#endif
		m_spuInUse[m_numSpus] = 0;
	}
xit:
	return res;
}
void SpuMgr::Term()
{
	// Tear down the manager: destroy every raw SPU created by Init and
	// reset the pool counter.  (SPURS finalization -- cellSpursFinalize on
	// m_exclusiveSpusSpurs / m_preemptedSpuSpurs -- is currently disabled,
	// matching the disabled initialization in Init.)
	for (uint32 spu = 0; spu < m_numSpus; spu++)
	{
		sys_raw_spu_destroy(m_spuIds[spu]);
	}
	m_numSpus = 0;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
uint32_t spumgr_mmio_read(uint32_t spu, uint32_t regoffset)
{
	// Read a raw SPU problem-state register via MMIO.  Replacement for
	// sys_raw_spu_mmio_read (see the header note about the SNC bug).
	uint64_t addr = get_reg_addr(spu,regoffset);
	// NOTE(review): truncates the register address to 32 bits -- assumes
	// the problem-state area is mapped below 4GB; confirm.
	addr &= 0xffffffffUL;
	volatile uint32_t * pAddr = (uint32_t*) addr;	// volatile: actual hardware register
	return *pAddr;
}
void spumgr_mmio_write(int spu, int regoffset, uint32_t value)
{
	// Write a raw SPU problem-state register via MMIO.  Replacement for
	// sys_raw_spu_mmio_write (see the header note about the SNC bug).
	uint64_t addr = get_reg_addr(spu,regoffset);
	// NOTE(review): truncates the register address to 32 bits -- assumes
	// the problem-state area is mapped below 4GB; confirm.
	addr &= 0xffffffffUL;
	volatile uint32_t * pAddr = (uint32_t*) addr;	// volatile: actual hardware register
	*pAddr = value;
}
//--------------------------------------------------------------------------------------------------
// Create Spu task from file based image
//--------------------------------------------------------------------------------------------------
static char modPath[MAX_PATH];
int SpuMgr::CreateSpuTask(const char *path, SpuTaskHandle *pTask,
CreateSPUTaskCallback *pfnCallback /* = NULL */)
{
int res = 0;
int ret;
uint32 spu;
register uint32 spuid;
uint32 entry;
FILE* fp;
void* pSpuProg = NULL;
sys_spu_image_t img;
pTask->m_spuId = -1;
pTask->m_ppuThread = NULL;
pTask->m_intrTag = NULL;
pTask->m_interruptThread = NULL;
// find free raw spu
for (spu = 0; spu < m_numSpus; spu++)
{
if (!m_spuInUse[spu])
{
break;
}
}
// check we found free spu
if (spu == m_numSpus)
{
res = 1;
goto xit;
}
// Loading an SPU program to the Raw SPU.
//if (sys_raw_spu_load(m_spuIds[spu], path, &entry) != SUCCEEDED)
sprintf(modPath, "%s/%s", g_pPS3PathInfo->PrxPath(), path);
path = modPath;
if(strstr(path,".self"))
{
ret = sys_spu_image_open(&img, path);
if(ret != CELL_OK)
{
// (Running on Main Thread)
Error("Failed to open SPU program: %s\n", path);
}
}
else
{
// Allocate mem for SPU prog
CellFsStat stat;
cellFsStat(path,&stat);
pSpuProg = memalign(4096,((uint32)stat.st_size + 0x7f)&0xffffff80);
fp = fopen(path, "rb");
fread(pSpuProg, 1, stat.st_size, fp );
fclose(fp);
ret = sys_spu_image_import(&img, pSpuProg, SYS_SPU_IMAGE_PROTECT);
if (ret != CELL_OK)
{
res = 1;
goto xit;
}
}
ret = sys_raw_spu_image_load(m_spuIds[spu], &img);
spuid = m_spuIds[spu];
if (ret == CELL_OK)
{
// successfully loaded - mark spu as used and fill in o/p
m_spuInUse[spu] = 1;
pTask->m_spuId = spuid;
}
else
{
res = 1;
goto xit;
}
//Free PPU resources used to load image
if(pSpuProg)
{
free(pSpuProg);
}
sys_spu_image_close(&img);
entry = sys_raw_spu_mmio_read((uint32_t)spuid, (uint32_t)SPU_NPC);
#ifndef _CERT
g_snRawSPUNotifyElfLoad(spuid, entry, path);
#endif
// call callback or create default interrupt handler
if (!pfnCallback)
{
res = CreateDefaultInterruptHandler(pTask);
}
else
{
res = pfnCallback(pTask);
}
if (res)
{
goto xit;
}
// Run the Raw SPU
#ifndef _CERT
g_snRawSPUNotifySPUStarted(m_spuIds[spu]);
#endif
sys_raw_spu_mmio_write(spuid, SPU_NPC, entry);
sys_raw_spu_mmio_write(spuid, SPU_RunCntl, 0x1);
__asm("eieio");
// Once the SPU has started, write a mailbox with the effective address of the
// SPU lock.
WriteMailbox( pTask, (uint32) &pTask->m_lock );
WriteMailbox( pTask, (uint32) &pTask->m_memcpyLock );
xit:
if(res)
{
// Error("Error: CreateSpuTask error attempting to load and run %s on SPU\n", path);
}
return res;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
void SpuMgr::DestroySpuTask(SpuTaskHandle *pTask)
{
	// Stop a running SPU task and tear down its interrupt plumbing.
	// Safe to call on a task that never started (m_spuId == -1); the
	// interrupt handles are checked individually before teardown.
	if (pTask->m_spuId != -1)
	{
		// Stop the Raw spu
#ifndef _CERT
		g_snRawSPUNotifySPUStopped(pTask->m_spuId);
#endif
		sys_raw_spu_mmio_write(pTask->m_spuId, SPU_RunCntl, 0x0);
		__asm("eieio");	// make sure the stop reaches the hardware before teardown
		// Cleanup interrupt handling mechanism
		if (pTask->m_interruptThread)
		{
			sys_interrupt_thread_disestablish(pTask->m_interruptThread); // also kills the thread
		}
		if (pTask->m_intrTag)
		{
			sys_interrupt_tag_destroy(pTask->m_intrTag);
		}
	}
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
int SpuMgr::WriteMailbox(SpuTaskHandle *pTask, uint32 val, bool bBlocking /* =true */)
{
	// Push one 32-bit value into the SPU Inbound Mailbox (a 4-entry FIFO).
	// When bBlocking is true, spin until the FIFO has room; otherwise make
	// a single attempt.  Returns 0 if the value was written, non-zero if
	// the FIFO was full (non-blocking mode only).
	uint32 slotsFree;
	for (;;)
	{
		// Poll the SPU Mailbox Status Register for free inbound slots.
		slotsFree = sys_raw_spu_mmio_read(pTask->m_spuId, SPU_MBox_Status) & SPU_IN_MBOX_COUNT;
		if (slotsFree || !bBlocking)
		{
			break;
		}
	}
	if (slotsFree)
	{
		sys_raw_spu_mmio_write(pTask->m_spuId, SPU_In_MBox, (uint32_t)val);
	}
	return !slotsFree;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
int SpuMgr::ReadMailbox(SpuTaskHandle *pTask, uint32 *pVal, bool bBlocking /* = true */)
{
	// Pull one 32-bit value from the SPU Outbound Mailbox into *pVal.
	// When bBlocking is true, spin until a message is available; otherwise
	// make a single attempt.  Returns 0 if a value was read, non-zero if
	// the mailbox was empty (non-blocking mode only).
	uint32 pending;
	do
	{
		// Poll the SPU Mailbox Status Register for a pending outbound message.
		pending = sys_raw_spu_mmio_read(pTask->m_spuId, SPU_MBox_Status) & SPU_OUT_MBOX_COUNT;
	} while (bBlocking && pending == 0);
	if (pending != 0)
	{
		// Read the SPU Outbound Mailbox Register.
		*pVal = sys_raw_spu_mmio_read(pTask->m_spuId, SPU_Out_MBox);
	}
	return pending == 0;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
int SpuMgr::ReadIntrMailbox(SpuTaskHandle *pTask, uint32 *pVal, bool bBlocking /* = true */)
{
	// Pull one 32-bit value from the SPU Outbound Interrupt Mailbox into
	// *pVal.  When bBlocking is true, spin until a message is available;
	// otherwise make a single attempt.  Returns 0 if a value was read,
	// non-zero if the mailbox was empty (non-blocking mode only).
	uint32 pending;
	do
	{
		// Poll the SPU Mailbox Status Register for a pending interrupt-mailbox message.
		pending = sys_raw_spu_mmio_read(pTask->m_spuId, SPU_MBox_Status) & SPU_OUT_INTR_MBOX_COUNT;
	} while (bBlocking && pending == 0);
	if (pending != 0)
	{
		// Read via the privileged accessor for the interrupt mailbox.
		sys_raw_spu_read_puint_mb(pTask->m_spuId, pVal);
	}
	return pending == 0;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
bool SpuMgr::Lock( SpuTaskHandle *pTask )
{
	// Try to acquire the task's lock word by atomically swapping 0 -> 1.
	// Returns true only if this call took the lock (previous value was 0).
	// Non-blocking: returns false immediately when the lock is already held.
	return cellAtomicCompareAndSwap32( &pTask->m_lock, 0, 1 ) == 0;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
void SpuMgr::Unlock( SpuTaskHandle *pTask )
{
	// Release the task lock by atomically swapping 1 -> 0.
	// A no-op if the lock word is not currently 1.
	cellAtomicCompareAndSwap32( &pTask->m_lock, 1, 0 );
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
bool SpuMgr::MemcpyLock( SpuTaskHandle *pTask )
{
	// Try to acquire the task's memcpy lock word (0 -> 1, atomically).
	// Returns true only if this call took the lock.  Non-blocking.
	return cellAtomicCompareAndSwap32( &pTask->m_memcpyLock, 0, 1 ) == 0;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
void SpuMgr::MemcpyUnlock( SpuTaskHandle *pTask )
{
	// Release the memcpy lock by atomically swapping 1 -> 0.
	// A no-op if the lock word is not currently 1.
	cellAtomicCompareAndSwap32( &pTask->m_memcpyLock, 1, 0 );
}

View File

@@ -0,0 +1,238 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Raw SPU management
//
//==================================================================================================
#ifndef INCLUDED_SPUMGR_PPU_H
#define INCLUDED_SPUMGR_PPU_H
//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------
#include <sys/spu_initialize.h>
#include <sys/raw_spu.h>
#include <sys/spu_utility.h>
#include <sys/ppu_thread.h>
#include <sys/interrupt.h>
#include <sys/raw_spu.h>
#include <sys/sys_time.h>
#include <cell/spurs.h>
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
extern "C"
{
extern void (*g_snRawSPULockHandler) (void);
extern void (*g_snRawSPUUnlockHandler) (void);
extern void (*g_snRawSPUNotifyCreation) (unsigned int uID);
extern void (*g_snRawSPUNotifyDestruction) (unsigned int uID);
extern void (*g_snRawSPUNotifyElfLoad) (unsigned int uID, unsigned int uEntry, const char *pFileName);
extern void (*g_snRawSPUNotifyElfLoadNoWait) (unsigned int uID, unsigned int uEntry, const char *pFileName);
extern void (*g_snRawSPUNotifyElfLoadAbs) (unsigned int uID, unsigned int uEntry, const char *pFileName);
extern void (*g_snRawSPUNotifyElfLoadAbsNoWait) (unsigned int uID, unsigned int uEntry, const char *pFileName);
extern void (*g_snRawSPUNotifySPUStopped) (unsigned int uID);
extern void (*g_snRawSPUNotifySPUStarted) (unsigned int uID);
};
//--------------------------------------------------------------------------------------------------
// Fwd refs
//--------------------------------------------------------------------------------------------------
class CellSpurs2;
class SpuTaskHandle;
//--------------------------------------------------------------------------------------------------
// Defines
//--------------------------------------------------------------------------------------------------
#define MAX_RAW_SPUS 5
// Class 2 Interrupt Status Register (INT_Stat_class2)
// Described in CBE architecture v10 on page 259
#define INTR_PPU_MB_SHIFT 0
#define INTR_STOP_SHIFT 1
#define INTR_HALT_SHIFT 2
#define INTR_DMA_SHIFT 3
#define INTR_SPU_MB_SHIFT 4
#define INTR_PPU_MB_MASK (0x1 << INTR_PPU_MB_SHIFT)
#define INTR_STOP_MASK (0x1 << INTR_STOP_SHIFT)
#define INTR_HALT_MASK (0x1 << INTR_HALT_SHIFT)
#define INTR_DMA_MASK (0x1 << INTR_DMA_SHIFT)
#define INTR_SPU_MB_MASK (0x1 << INTR_SPU_MB_SHIFT)
// thread priority for interrupt handler threads
#define INTR_HANDLER_THREAD_PRIORITY 200
#define INTR_HANDLER_THREAD_STACK_SIZE 0x4000
#define SPUMGR_IS_ALIGNED(val, align) (((val) & ((align) - 1)) == 0)
#define SPUMGR_ALIGN_UP(val, align) (((val) + ((align)-1)) & ~((align) - 1))
#define SPUMGR_ALIGN_DOWN(val, align) ((val) & ~((align) - 1))
//--------------------------------------------------------------------------------------------------
// Overide sys_raw_spu_mmio_read / write, since they draw out another bug in SNC :(
//--------------------------------------------------------------------------------------------------
#define sys_raw_spu_mmio_read(spu, regoffset) spumgr_mmio_read(spu, regoffset)
extern uint32_t spumgr_mmio_read(uint32_t spu, uint32_t regoffset);
#define sys_raw_spu_mmio_write(spu, regoffset, value) spumgr_mmio_write(spu, regoffset, value)
extern void spumgr_mmio_write(int id, int offset, uint32_t value);
//--------------------------------------------------------------------------------------------------
// Types
//--------------------------------------------------------------------------------------------------
typedef int CreateSPUTaskCallback(SpuTaskHandle *pTask);
// SpuStatusRegister
// Bit layout of the SPU_Status problem-state register.
// Described in CBE architecture v10 on page 87.
// Fields are declared from the stop-code halfword down to the run bit;
// GetStopSignal() uses m_sc to recover the stop-and-signal code.
typedef union SpuStatusRegister
{
	struct
	{
		uint32_t m_sc : 16;								// stop-and-signal code (immediate of the "stop" instruction)
		uint32_t m_reserved2 : 5;
		uint32_t m_isolateExitStatus : 1;
		uint32_t m_isolateLoadStatus : 1;
		uint32_t m_reserved1 : 1;
		uint32_t m_isolationStatus : 1;
		uint32_t m_illegalChannelInstructionDetected : 1;
		uint32_t m_invalidInstructionDetected : 1;
		uint32_t m_singleStepStatus : 1;
		uint32_t m_waitStatus : 1;
		uint32_t m_haltStatus : 1;
		uint32_t m_programStopAndSignalStatus : 1;
		uint32_t m_runStatus : 1;
	};
	uint32_t m_val;		// raw 32-bit register value (set this, then read fields)
} SpuStatusRegister;
//--------------------------------------------------------------------------------------------------
// Classes
//--------------------------------------------------------------------------------------------------
// Handle for one task running on a raw SPU, filled in by
// SpuMgr::CreateSpuTask and torn down by SpuMgr::DestroySpuTask.
class SpuTaskHandle
{
public:
	sys_raw_spu_t m_spuId;								// raw SPU id; -1 when no SPU is attached
	sys_ppu_thread_t m_ppuThread;						// PPU thread servicing this SPU's interrupts
	sys_interrupt_tag_t m_intrTag;						// class 2 interrupt tag for this SPU
	sys_interrupt_thread_handle_t m_interruptThread;	// binding of m_intrTag to m_ppuThread
	uint32_t m_lock;									// task lock word: 0 free, 1 held (see SpuMgr::Lock/Unlock)
	uint32_t m_memcpyLock;								// SPU memcpy lock word: 0 free, 1 held
};
//--------------------------------------------------------------------------------------------------
// SpuMgr
//
// Provides functionality for running raw spu tasks. For this purpose it creates
// and manages a raw spu pool
//
// Currently we assume a simple setup where app loads an elf on to a raw spu,
// after which the spu starts running the elf and continues to do so thereafter.
// The ppu->spu and spu->ppu communication is explicitly handled by the app
// and the spu program using SpuMgr methods
//
// Currently all DMA transfer is supposed to be initiated by the SPUs which is
// why SpuMgr does not provide any DMA functionality
//--------------------------------------------------------------------------------------------------
class SpuMgr
{
public:
	// Init/Term
	int Init(int numRawSpu);	// create numRawSpu raw SPUs; returns 0 on success
	void Term();				// destroy all raw SPUs created by Init
	// Create/Destroy tasks
	int CreateSpuTask(const char *path, SpuTaskHandle *pTask, CreateSPUTaskCallback *pfnCallback = NULL);
	void DestroySpuTask(SpuTaskHandle *pTask);
	//
	// Helper functions to communicate with the SPU
	// As we build more functionality into the SPU mgr it is
	// possible that we will need to expose less of
	// these low-level functions
	//
	//
	// Mailbox functions
	//
	//
	// The SPU Inbound Mailbox is a 4-level FIFO structure for communication from the
	// PPU to SPU, and can hold up to four 32-bit messages.
	// If there are already four messages in the mailbox the last message will be
	// overwritten...but we can check for a full mailbox and prevent this.
	// All three mailbox calls return 0 on success and non-zero when the
	// non-blocking attempt found no room / no message.
	int WriteMailbox(SpuTaskHandle *pTask, uint32_t val, bool bBlocking = true);
	// The SPU Outbound Mailbox can hold one 32-bit message for SPU-to-PPU communication.
	int ReadMailbox(SpuTaskHandle *pTask, uint32_t *pVal, bool bBlocking = true);
	// The SPU Outbound Interrupt Mailbox can hold one 32-bit message for SPU-to-PPU communication.
	int ReadIntrMailbox(SpuTaskHandle *pTask, uint32_t *pVal, bool bBlocking = true);
	//
	// Access to local store - note that this involves MMIO which will be slow
	// so need to use DMA instead for any significant data transfer. This
	// mechanism may be useful for writing some small amount of data such
	// as some constants etc into LS
	//
	int WriteLS(SpuTaskHandle *pTask, uint32_t lsOffset, void *pData, uint32_t size);
	int ReadLS(SpuTaskHandle *pTask, uint32_t lsOffset, void *pData, uint32_t size);
	// Non-blocking per-task locks (see SpuTaskHandle::m_lock / m_memcpyLock).
	bool Lock( SpuTaskHandle *pTask );
	void Unlock( SpuTaskHandle *pTask );
	bool MemcpyLock( SpuTaskHandle *pTask );
	void MemcpyUnlock( SpuTaskHandle *pTask );
	// CellSpurs2 m_exclusiveSpusSpurs; // SPURS instance running on SPUs used exclusively by the application
	// CellSpurs2 m_preemptedSpuSpurs; // SPURS instance running on an SPU shared with the OS (may be preempted by it occasionally)
private:
	uint32_t m_numSpus;						// number of raw SPUs created
	uint32_t m_spuInUse[MAX_RAW_SPUS];		// per-slot busy flags (1 = task loaded)
	sys_raw_spu_t m_spuIds[MAX_RAW_SPUS];	// OS ids of the raw SPUs
	// NOTE(review): declared but no definition visible in this file.
	int ReadMailboxChannel(SpuTaskHandle *pTask, uint32_t *pVal,
		uint32_t countMask, uint32_t channel, bool bBlocking = true);
};
//--------------------------------------------------------------------------------------------------
// Externs
//--------------------------------------------------------------------------------------------------
extern SpuMgr gSpuMgr;
//--------------------------------------------------------------------------------------------------
// DmaCheckAlignment
// Checks restrictions specified in SpuMgr::DmaGet
//--------------------------------------------------------------------------------------------------
int DmaCheckAlignment(uint32_t src, uint32_t dest, uint32_t size);
//--------------------------------------------------------------------------------------------------
// GetStopSignal
//--------------------------------------------------------------------------------------------------
inline uint32_t GetStopSignal( sys_raw_spu_t idSpu )
{
	// Read the SPU_Status register and extract the 16-bit stop-and-signal
	// code (the immediate operand of the "stop" instruction the SPU hit).
	SpuStatusRegister status;
	status.m_val = sys_raw_spu_mmio_read(idSpu, SPU_Status);
	return status.m_sc;
}
#endif // INCLUDED_SPUMGR_PPU_H

View File

@@ -0,0 +1,485 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
//
//
//==================================================================================================
//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------
#include "SpuMgr_spu.h"
#include <cell/atomic.h>
#ifndef _CERT
#include <libsn_spu.h>
#endif
#include <stdlib.h>
#include <string.h>
//--------------------------------------------------------------------------------------------------
// Globals
//--------------------------------------------------------------------------------------------------
// singleton instance
SpuMgr gSpuMgr __attribute__((aligned(128)));
// 16B staging area used by SPU_memcpy to carry destination bytes across chunk
// boundaries when the destination is not 16B aligned
unsigned char gUnalignedMem[16] __attribute__((aligned(16)));
// header describing the current PPU-requested memcpy (fetched via DMA)
MemCpyHeader gMemCpyHeader __attribute__((aligned(16)));
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Services one memcpy request from the PPU: the mailbox delivers the EA of a
// MemCpyHeader, which is DMA'd into gMemCpyHeader; the copy is then streamed
// src -> dst through the two LS staging buffers pBuf1/pBuf2 in 8KB chunks.
// An unaligned destination is handled by read-modify-write: bytes preceding
// dst within its 16B block are preserved in gUnalignedMem and merged in.
void SPU_memcpy( void *pBuf1, void *pBuf2 )
{
uint32_t header;
// EA of the MemCpyHeader for this request (blocking mailbox read).
gSpuMgr.ReadMailbox( &header );
// NOTE(review): MemcpyLock() is a try-lock (CAS) and its result is ignored
// here — verify callers guarantee exclusive use, or exclusion is not provided.
gSpuMgr.MemcpyLock();
gSpuMgr.DmaGetUNSAFE( &gMemCpyHeader, header, sizeof( MemCpyHeader ), 0 );
gSpuMgr.DmaDone( 0x1 );
// Source must be 16B aligned; the destination may be unaligned.
DEBUG_ERROR( ( gMemCpyHeader.src & 0xf ) == 0 );
uint32_t sizeAligned;
uint32_t sizeAlignedDown;
uint32_t dstAlignedDown;
uint32_t offset;
// cacheLine carries the destination bytes preceding dst in its 16B block.
memcpy( gUnalignedMem, gMemCpyHeader.cacheLine, 16 );
// Full 8KB chunks while more than one chunk remains.
while ( gMemCpyHeader.size > 8192 )
{
sizeAligned = 8192;
dstAlignedDown = SPUMGR_ALIGN_DOWN( gMemCpyHeader.dst, 16 );
offset = gMemCpyHeader.dst - dstAlignedDown;
gSpuMgr.DmaGetUNSAFE( pBuf1, gMemCpyHeader.src, sizeAligned, 0 );
gSpuMgr.DmaDone( 0x1 );
if ( offset )
{
// Re-insert the preserved leading destination bytes before the payload.
memcpy( pBuf2, gUnalignedMem, offset );
}
memcpy( (void *) ( (uint32_t) pBuf2 + offset ), pBuf1, sizeAligned );
// Ensure the LS stores are visible to the MFC before the PUT.
gSpuMgr.DmaSync();
gSpuMgr.DmaPut( dstAlignedDown, pBuf2, SPUMGR_ALIGN_UP( sizeAligned + offset, 16 ), 0 );
gSpuMgr.DmaDone( 0x1 );
// Save the tail bytes of this chunk for the next iteration's leading edge.
sizeAlignedDown = SPUMGR_ALIGN_DOWN( sizeAligned + offset, 16 );
memcpy( gUnalignedMem, (void *) ( (uint32_t) pBuf2 + sizeAlignedDown ), 16 );
gMemCpyHeader.size -= sizeAligned;
gMemCpyHeader.dst += 8192;
gMemCpyHeader.src += 8192;
}
// Final (<= 8KB) chunk: same read-merge-write sequence as above.
sizeAligned = SPUMGR_ALIGN_UP( gMemCpyHeader.size, 16 );
dstAlignedDown = SPUMGR_ALIGN_DOWN( gMemCpyHeader.dst, 16 );
offset = gMemCpyHeader.dst - dstAlignedDown;
gSpuMgr.DmaGetUNSAFE( pBuf1, gMemCpyHeader.src, sizeAligned, 0 );
gSpuMgr.DmaDone( 0x1 );
if ( offset )
{
memcpy( pBuf2, gUnalignedMem, offset );
}
memcpy( (void *) ( (uint32_t) pBuf2 + offset ), pBuf1, gMemCpyHeader.size );
sizeAligned = SPUMGR_ALIGN_UP( gMemCpyHeader.size + offset, 16 );
gSpuMgr.DmaSync();
gSpuMgr.DmaPut( dstAlignedDown, pBuf2, sizeAligned, 0 );
gSpuMgr.DmaDone( 0x1 );
// Signal completion back to the PPU when the caller is waiting on it.
if ( gMemCpyHeader.blocking )
{
gSpuMgr.WriteMailbox( 0 );
}
gSpuMgr.MemcpyUnlock();
}
//--------------------------------------------------------------------------------------------------
// DmaCheckAlignment
//
// Checks restrictions specified in SpuMgr::DmaGet
//--------------------------------------------------------------------------------------------------
// Validates the MFC DMA restrictions (see SpuMgr::DmaGet comment): sizes of
// 1/2/4/8 bytes must have matching low 4 address bits at both ends; sizes of
// 16+ must be a multiple of 16 and 16B-aligned. Returns non-zero when legal.
// Compiled out (always returns 1) in _CERT builds.
int DmaCheckAlignment(uint32_t src, uint32_t dest, uint32_t size)
{
#if !defined( _CERT )
uint32_t align = size;
bool error = false;
if (size >= 16 && ((size & 0xf) == 0))
{
// Multiples of 16B only need 16B alignment.
align = 16;
}
else if (size == 8 || size == 4 || size == 2 || size == 1)
{
// Naturally aligned small transfer: low 4 bits of LS and EA must match.
error = ((src & 0xF) != (dest & 0xF));
}
else
{
error = true; // bad size
}
// Also reject null addresses and misalignment relative to 'align'.
return (!error && src && dest &&
SPUMGR_IS_ALIGNED(src, align) &&
SPUMGR_IS_ALIGNED(dest, align));
#else // _CERT
return 1;
#endif // _CERT
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// One-time SPU-side setup: restarts the decrementer (it may not be running by
// default), resets the transfer/allocation statistics, and receives the
// effective addresses of the two PPU-side lock words via the mailbox.
// Always returns 0.
int SpuMgr::Init()
{
// Start the decrementer since it is possible
// that it has not been started by default
const unsigned int kEventDec = 0x20;
// Disable the decrementer event.
unsigned int maskEvents = spu_readch(SPU_RdEventStatMask);
spu_writech(SPU_WrEventMask, maskEvents & ~kEventDec);
// Acknowledge any pending events and stop the decrementer.
spu_writech(SPU_WrEventAck, kEventDec);
// Write the decrementer value to start the decrementer.
unsigned int decValue = spu_readch(SPU_RdDec);
spu_writech(SPU_WrDec, decValue);
// Enable events.
spu_writech(SPU_WrEventMask, maskEvents | kEventDec);
// Reset byte count
ResetBytesTransferred();
// reset malloc count
m_mallocCount = 0;
// Read the effective address of the SPU locks.
// NOTE(review): mailbox order must match the PPU-side writes — lock EA
// first, then memcpy-lock EA.
ReadMailbox( &m_lockEA );
ReadMailbox( &m_memcpyLockEA );
return 0;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Shutdown counterpart to Init(); nothing to release currently.
void SpuMgr::Term()
{
}
//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaGet
//
// DmaGet - alignment and size checking
// DmaGetUNSAFE - no alignment or size checking (but will assert in debug)
// _DmaGet - handles badly aligned dma's, should be a private member really (doesn't handle small dma's)
//
// DMA restrictions
// An MFC supports naturally aligned DMA transfer sizes of 1, 2, 4,
// 8, and 16 bytes and multiples of 16 bytes
// Furthermore, if size is 1, 2, 4, or 8 bytes then lower 4 bits
// of LS and EA must match
//
// Note:
// Peak performance is achieved for transfers in which both the EA and
// the LSA are 128-byte aligned and the size of the transfer is a multiple
// of 128 bytes.
//--------------------------------------------------------------------------------------------------
// Starts a GET (main mem -> LS) without alignment fix-ups. The caller must
// satisfy the MFC restrictions documented above (asserted in debug builds).
// The transfer is asynchronous; wait on it with DmaDone(1 << tagId).
void SpuMgr::DmaGetUNSAFE(void *ls, uint32_t ea, uint32_t size, uint32_t tagId)
{
	DEBUG_ERROR( ea < 0xd0000000 );
	DEBUG_ERROR( ea );
	DEBUG_ERROR(DmaCheckAlignment((uint32_t)ls, ea, size));

	// Account for the transfer before 'size' is consumed below.
	// BUGFIX: the original updated these counters after the loop, when
	// 'size' had already been decremented to 0, so nothing was ever added.
	m_bytesRequested += size;
	m_bytesTransferred += size;

	// Issue the transfer in chunks of at most 16KB (MFC max transfer size).
	while (size)
	{
		uint32_t dmaSize = 0x4000;
		dmaSize = (size < dmaSize)? size: dmaSize;
		size -= dmaSize;
		// kick off dma
		spu_mfcdma64( (void*)ls, 0, ea, dmaSize, tagId, MFC_GET_CMD);
		m_numDMATransfers++;
		ls = (void*)((uint32_t)ls + dmaSize);
		ea += dmaSize;
	}
}
//--------------------------------------------------------------------------------------------------
// SpuMgr::_DmaGet
//
// Internal function - do not call this directly
//--------------------------------------------------------------------------------------------------
// Handles GETs whose EA, LS address or size violate the MFC alignment rules:
// rounds the region out to 16B boundaries, reads into a 16B-aligned temp
// buffer, then memcpy's just the requested bytes back to 'ls'.
// NOTE(review): when a temp buffer is used this blocks on DmaDone internally,
// so the caller-supplied tagId does not give async behavior in that case.
void SpuMgr::_DmaGet(void *ls, uint32_t ea, uint32_t size, uint32_t tagId)
{
uint32_t unaligned = false;
uint32_t eaAligned = (uint32_t)ea;
uint32_t sizeAligned = size;
uint32_t lsAligned = (uint32_t)ls;
uint32_t sizeOffset = 0;
char *pTempBuff = NULL;
// check if src is unaligned
if (eaAligned & 0xF)
{
eaAligned = eaAligned & ~0xF; // round down
sizeOffset = ea - eaAligned;   // bytes of padding before the requested data
sizeAligned += sizeOffset;
unaligned = true;
}
// check if size is unaligned
if (sizeAligned & 0xF)
{
sizeAligned = (sizeAligned + 0xF) & ~0xF; // round up
unaligned = true;
}
// if we have adjusted the size, or if ls is unaligned,
// we need to alloc temp buffer
if (unaligned || (lsAligned & 0xF))
{
pTempBuff = (char*)MemAlign(0x10, sizeAligned);
lsAligned = (uint32_t)pTempBuff;
unaligned = true;
}
// add up bytes transferred, for informational purposes
m_bytesRequested += size;
m_bytesTransferred += sizeAligned;
// do the dma, in chunks of at most 16KB (MFC max transfer size)
while (sizeAligned)
{
uint32_t dmaSize = 0x4000;
dmaSize = (sizeAligned < dmaSize)? sizeAligned: dmaSize;
sizeAligned -= dmaSize;
// kick off dma
spu_mfcdma64( (void*)lsAligned, 0, eaAligned, dmaSize, tagId, MFC_GET_CMD);
m_numDMATransfers++;
lsAligned += dmaSize;
eaAligned += dmaSize;
}
if (unaligned)
{
// block for now till dma done because we do the memcpy right here
DmaDone(1 << tagId);
// copy data over, skipping the leading padding bytes
memcpy(ls, pTempBuff + sizeOffset, size);
// free temp buff
Free(pTempBuff);
}
}
//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaGetSAFE
//
// DMA restrictions (look at SpuMgr::DmaGetUNSAFE in this file) are
// handled transparently by this function
//--------------------------------------------------------------------------------------------------
// GET (main mem -> LS) that transparently handles the MFC alignment/size
// restrictions (see DmaGetUNSAFE): well-formed requests go straight to the
// hardware, everything else is routed through _DmaGet's temp-buffer path.
void SpuMgr::DmaGetSAFE(void *ls, uint32_t ea, uint32_t size, uint32_t tagId)
{
	DEBUG_ERROR( ea );

	if( size < 0x10 )
	{
		// lowest 4 bits of address have to match regardless, &
		// size can only be 1, 2, 4 or 8 B
		if( size==0x1 || size==0x2 || size==0x4 || size==0x8 )
		{
			// BUGFIX: was '(uint32_t)ls&0xF == ea&0xF', which parses as
			// '((uint32_t)ls & (0xF == ea)) & 0xF' because == binds tighter
			// than &, so the fast path was effectively never taken. The
			// intended comparison of the low 4 address bits is:
			if( ( (uint32_t)ls & 0xF ) == ( ea & 0xF ) )
			{
				DmaGetUNSAFE(ls,ea,size,tagId);
			}
			else
			{
				// small get not aligned within a 16B block
				_DmaGet(ls,ea,size,tagId);
			}
		}
		else
		{
			// if < 16B can only get 1,2,4 or 8B
			_DmaGet(ls,ea,size,tagId);
		}
	}
	else
	{
		if( (!(size & 0xF)) &&          // has to be multiple of 16B, &
			(((uint32_t)ls&0xF)==0) &&  // ea and ls have to be 16B aligned
			((ea&0xF)==0) )
		{
			// alignment is okay just dma
			DmaGetUNSAFE(ls,ea,size,tagId);
		}
		else
		{
			_DmaGet(ls,ea,size,tagId);
		}
	}
}
//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaPut
//--------------------------------------------------------------------------------------------------
// Starts a PUT (LS -> main mem). The caller must satisfy the MFC alignment
// restrictions (asserted in debug builds); the transfer is asynchronous —
// wait on it with DmaDone(1 << tagId).
void SpuMgr::DmaPut(uint32_t ea, void *ls, uint32_t size, uint32_t tagId)
{
	DEBUG_ERROR( (ea!=0) && (ea<0xd0000000) ); // valid ea
	DEBUG_ERROR( (uint32_t)ls < 0x40000 ); // valid ls
	DEBUG_ERROR(DmaCheckAlignment((uint32_t)ls, ea, size));

	// Issue the PUT in chunks of at most 16KB (MFC max transfer size).
	uint32_t lsAddr = (uint32_t)ls;
	while (size)
	{
		uint32_t chunk = ( size > 0x4000 ) ? 0x4000 : size;
		// initiate dma to ppu
		spu_mfcdma64( (void*)lsAddr, 0, ea, chunk, tagId, MFC_PUT_CMD);
		lsAddr += chunk;
		ea += chunk;
		size -= chunk;
	}
}
//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaSmallPut
//--------------------------------------------------------------------------------------------------
// PUT (LS -> main mem) for small/odd sizes: picks the largest naturally
// aligned element width (8/4/2/1 bytes) that evenly divides 'size' and issues
// one MFC PUT per element. Alignment is still asserted in debug builds.
void SpuMgr::DmaSmallPut(uint32_t ea, void *ls, uint32_t size, uint32_t tagId)
{
	DEBUG_ERROR( (ea!=0) && (ea<0xd0000000) ); // valid ea
	DEBUG_ERROR( (uint32_t)ls < 0x40000 ); // valid ls
	DEBUG_ERROR(DmaCheckAlignment((uint32_t)ls, ea, size));

	// Largest element width that divides the total size.
	uint32_t elemSize;
	if ((size & 7) == 0)
	{
		elemSize = 8;
	}
	else if ((size & 3) == 0)
	{
		elemSize = 4;
	}
	else if ((size & 1) == 0)
	{
		elemSize = 2;
	}
	else
	{
		elemSize = 1;
	}

	uint32_t lsAddr = (uint32_t)ls;
	while (size)
	{
		// initiate dma to ppu, one element at a time
		spu_mfcdma64( (void*)lsAddr, 0, ea, elemSize, tagId, MFC_PUT_CMD);
		lsAddr += elemSize;
		ea += elemSize;
		size -= elemSize;
	}
}
//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaGetlist
//
// Gather data scattered around main mem, MFC will run through the list, and place the elements (based on ea address and size)
// contiguously in ls.
//
// NOTE: if an individual list element size is <16B, the data will still be dma'd but the proceeding element will be placed
// on the next 16B boundary. So it is possible to get lots of small elements, but you will be left with gaps in ls.
//
// ls - ls address of where items will be placed (contiguously)
// lsList - ls address of actual list
// sizeList - size of list in bytes (each list element is 8B (sizeof(DMAList)), so sizeList should be number of list elements // sizeof(DMAList))
// tagId - works the same way as regular DMA's
//
// Alignment and Size Restrictions:
// -ls and lsList must be 8B aligned
// -size must be a multiple of 8B (sizeof(DMAList))
// -no more than 2048 list elements
//
// light error checking right now
//--------------------------------------------------------------------------------------------------
// Issues a DMA list GET: gathers the (ea, size) elements of pLS_List
// contiguously into 'ls'. See the banner above for alignment/size rules.
// NOTE(review): the assert uses '<', so exactly 2048 elements trips it even
// though the banner says "no more than 2048" — confirm intended limit.
void SpuMgr::DmaGetList(void *ls, DMAList *pLS_List, uint32_t sizeList, uint32_t tagId)
{
DEBUG_ERROR( ((uint32_t)pLS_List&0x7) == 0 ); // ls address must be 8B aligned
DEBUG_ERROR( ((uint32_t)ls&0x7) == 0 ); // ea so aligned also, due to offset within 16B alignment restrictions
DEBUG_ERROR( (sizeList&0x7) == 0 ); // list size is a multiple of 8B
DEBUG_ERROR( sizeList<(2048*sizeof(DMAList))); // no more than 2048 list elements
// initiate dma list
spu_mfcdma64( ls, 0, (uint32_t)pLS_List, sizeList, tagId, MFC_GETL_CMD );
}
//--------------------------------------------------------------------------------------------------
// SpuMgr::DmaGPutlist
//
// Scatter data held contiguously in ls, to main mem
//
// ls - ls address of where items exist (contiguously) to be scattered back to main mem
// lsList - ls address of actual list
// sizeList - size of list in bytes (each list element is 8B (sizeof(DMAList)), so sizeList should be number of list elements * sizeof(DMAList))
// tagId - works the same way as regular DMA's
//
// Alignment and Size Restrictions:
// ls and lsList must be 8B aligned, size must be a multiple of 8B (sizeof(DMAList))
//
// light error checking right now
//--------------------------------------------------------------------------------------------------
// Issues a DMA list PUT: scatters contiguous data at 'ls' to the (ea, size)
// elements of pLS_List. See the banner above for alignment/size rules.
// NOTE(review): '<' rejects exactly 2048 elements — confirm intended limit.
void SpuMgr::DmaPutList(void *ls, DMAList* pLS_List, uint32_t sizeList, uint32_t tagId)
{
DEBUG_ERROR( ((uint32_t)pLS_List&0x7) == 0 ); // ls address must be 8B aligned
DEBUG_ERROR( ((uint32_t)ls&0x7) == 0 ); // ea so aligned also, due to offset within 16B alignment restrictions
DEBUG_ERROR( (sizeList&0x7) == 0 ); // list size is a multiple of 8B
DEBUG_ERROR( sizeList<(2048*sizeof(DMAList))); // no more than 2048 list elements
// initiate dma list
spu_mfcdma64( ls, 0, (uint32_t)pLS_List, sizeList, tagId, MFC_PUTL_CMD );
}

View File

@@ -0,0 +1,473 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
//
//
//==================================================================================================
#ifndef INCLUDED_SPUMGR_SPU_H
#define INCLUDED_SPUMGR_SPU_H
//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------
#include <stdint.h>
#include <string.h>
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include <stdlib.h>
#include <cell/atomic.h>
#include "SpuMgr_dma.h"
#include <libsn_spu.h>
//--------------------------------------------------------------------------------------------------
// Defines
//--------------------------------------------------------------------------------------------------
#define DEBUG_ASSERT(val) Assert(val)
#define DEBUG_ERROR(val) Assert(val)
#define Msg(...)
#define Error(...)
#define DebuggerBreak() snPause()
#include <sys/integertypes.h>
//Short aliases
typedef int8_t s8;
typedef uint8_t u8;
typedef int16_t s16;
typedef uint16_t u16;
typedef int32_t s32;
typedef uint32_t u32;
typedef uint32_t u64[2];
typedef float f32;
typedef double f64;
typedef int BOOL;
typedef s8 int8;
typedef u8 uint8;
typedef s16 int16;
typedef u16 uint16;
typedef s32 int32;
typedef u32 uint32;
typedef u64 uint64;
typedef unsigned int uintp;
typedef unsigned int uint;
typedef vector float fltx4 ;
#define INT_MAX 0x7fffffff
#define DECL_ALIGN(x) __attribute__( ( aligned( x ) ) )
#define ALIGN16 DECL_ALIGN(16)
#define ALIGN16_POST
#define ALIGN128 DECL_ALIGN(128)
#define ALIGN128_POST
// Rounds val up to the next multiple of alignment.
// alignment must be a power of two; val is returned unchanged when already
// aligned.
template <typename T>
inline T AlignValue( T val, uintp alignment )
{
	uintp mask = alignment - 1;
	return ( T )( ( ( uintp )val + mask ) & ~mask );
}
#define ALIGN_VALUE( val, alignment ) ( ( val + alignment - 1 ) & ~( alignment - 1 ) )
// True when at most one bit of x is set.
// Note: deliberately returns true for x == 0 (matches existing callers).
inline bool IsPowerOfTwo( uint x )
{
	return 0 == ( x & ( x - 1 ) );
}
#define FORCEINLINE inline /* __attribute__ ((always_inline)) */
#define IsPlatformPS3() 1
#define IsPlatformPS3_PPU() 0
#define IsPlatformPS3_SPU() 1
#define IsPlatformX360() 0
#define IsPlatformOSX() 0
#define RESTRICT
#define V_memset __builtin_memset
#define V_memcpy memcpy
void SPU_memcpy( void *pBuf1, void *pBuf2 );
#define MemAlloc_AllocAligned(size, align) gSpuMgr.MemAlign(align, size)
#define ARRAYSIZE(p) (sizeof(p)/sizeof(p[0]))
#define MIN( a, b ) ( ( ( a ) < ( b ) ) ? ( a ) : ( b ) )
#define MAX( a, b ) ( ( ( a ) > ( b ) ) ? ( a ) : ( b ) )
//--------------------------------------------------------------------------------------------------
// Task handle
//--------------------------------------------------------------------------------------------------
// Per-SPU bookkeeping shared between the PPU-side and SPU-side managers.
class SpuTaskHandle
{
public:
uint32_t m_spuId;            // raw SPU id this task runs on
uint64_t m_ppuThread;        // presumably the owning PPU thread id — confirm against PPU-side code
uint32_t m_intrTag;          // interrupt tag for the SPU's interrupt mailbox — TODO confirm
uint32_t m_interruptThread;  // handler thread for SPU interrupts — TODO confirm
uint32_t m_lock;             // lock word (see SpuMgr::Lock/Unlock)
uint32_t m_memcpyLock;       // lock word guarding SPU_memcpy (see MemcpyLock/MemcpyUnlock)
};
//--------------------------------------------------------------------------------------------------
// SpuMgr
//--------------------------------------------------------------------------------------------------
// SPU-side manager singleton (gSpuMgr): wraps MFC DMA, mailboxes, atomic
// cache-line access, the decrementer, and a simple counted heap.
class SpuMgr
{
public:
// Init/Term
int Init();
void Term();
// MFC Atomic Update functionality
// Currently provides functionality to read/write up to
// one cache line (128 bytes) of main mem
inline void MFCAGet(void *ls, uint32_t ea, uint32_t size);
inline void MFCAPut(void *ls, uint32_t ea, uint32_t size);
//
// DMA functionality
//
// tagId is a value between 0 and 31 that can be used to group
// dma requests together
void DmaGetSAFE(void *ls, uint32_t ea, uint32_t size, uint32_t tagId);
void DmaGetUNSAFE(void *ls, uint32_t ea, uint32_t size, uint32_t tagId);
void DmaPut(uint32_t ea, void *ls, uint32_t size, uint32_t tagId);
void DmaSmallPut(uint32_t ea, void *ls, uint32_t size, uint32_t tagId);
void DmaGetList(void *ls, DMAList *pLS_List, uint32_t sizeList, uint32_t tagId);
void DmaPutList(void *ls, DMAList* pLS_List, uint32_t sizeList, uint32_t tagId);
// Returns 0 once all transfers in dmaTagMask are complete (blocks by default).
inline int DmaDone(uint32_t dmaTagMask, bool bBlocking = true);
// DmaSync
// All earlier store instructions are forced to complete
// before proceeding. This function ensures that all stores to
// to local storage are visible to the MFC or PPU.
inline void DmaSync()
{
__asm("dsync");
}
//
// Mailbox functions - see SpuMgr_ppu.h for a descrition of mailboxes
//
int WriteMailbox(uint32_t val, bool bBlocking = true);
int WriteIntrMailbox(uint32_t val, bool bBlocking = true);
int WriteMailboxChannel(uint32_t val, uint32_t channel, bool bBlocking /* = true */);
int ReadMailbox(uint32_t *pVal, bool bBlocking = true);
// Try-locks on PPU-shared lock words (CAS based; see inline defs below).
bool Lock();
void Unlock();
bool MemcpyLock();
void MemcpyUnlock();
// Decrementer access, for time stamps
inline uint32_t ReadDecr(void);
// mem mgr - thin counted wrappers over the LS heap
void *Malloc( uint32_t size )
{
m_mallocCount++;
void *ptr = malloc( size );
DEBUG_ASSERT( ptr );
return ptr;
}
// Aligned alloc that may return NULL (no assert).
void *MemAlignUNSAFE(uint32_t boundary, uint32_t size )
{
m_mallocCount++;
void *ptr = memalign( boundary, size );
return ptr;
}
// Aligned alloc that asserts on failure.
void *MemAlign( uint32_t boundary, uint32_t size )
{
void *ptr = MemAlignUNSAFE(boundary, size);
DEBUG_ERROR( ptr );
return ptr;
}
void Free( void *pData )
{
m_mallocCount--;
free( pData );
}
// Outstanding allocation count (Malloc/MemAlign minus Free).
uint32_t GetMallocCount()
{
return m_mallocCount;
}
// counters to help us keep track of how much data we are moving
inline void ResetBytesTransferred()
{
m_bytesRequested = 0;
m_bytesTransferred = 0;
m_numDMATransfers = 0;
}
// Private data and member functions
void _DmaGet(void *ls, uint32_t ea, uint32_t size, uint32_t tagId);
// 128B-aligned scratch cache lines used by the atomic (GETLLAR/PUTLLC) ops.
uint32_t m_lock[32] __attribute__ ((aligned(128)));
uint32_t m_lockEA;        // effective address of the PPU-shared lock word
uint32_t m_memcpyLock[32] __attribute__ ((aligned(128)));
uint32_t m_memcpyLockTest;// __attribute__ ((aligned(128)));
uint32_t m_memcpyLockEA;  // effective address of the memcpy lock word
uint32_t m_bytesRequested;   // bytes callers asked to move
uint32_t m_bytesTransferred; // bytes actually DMA'd (incl. alignment padding)
uint32_t m_numDMATransfers;  // number of MFC commands issued
uint32_t m_mallocCount;      // outstanding allocation count
uint8_t m_MFCACacheLine[128] __attribute__ ((aligned(128))); // MFCAGet/Put staging line
};
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Atomically reads up to one 128B cache line from main memory: fetches the
// containing cache line with GETLLAR (blocking) and copies 'size' bytes at
// the requested offset into 'ls'. size + offset must fit in one line.
inline void SpuMgr::MFCAGet(void *ls, uint32_t ea, uint32_t size)
{
// get start of cache line
uint32_t eaAligned = SPUMGR_ALIGN_DOWN(ea, 0x80);
// get offset to given ea
uint32_t eaOffset = ea - eaAligned;
// check size to read
DEBUG_ASSERT(size + eaOffset <= 0x80);
// read cache line
spu_mfcdma64(&m_MFCACacheLine[0], 0, eaAligned, 128, 0, MFC_GETLLAR_CMD);
// wait for completion - this is a blocking read
spu_readch(MFC_RdAtomicStat);
// copy out data
memcpy(ls, &m_MFCACacheLine[eaOffset], size);
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Atomically updates up to one 128B cache line in main memory using the
// load-with-reservation / store-conditional pair (GETLLAR/PUTLLC), retrying
// until the conditional store succeeds. size + offset must fit in one line.
inline void SpuMgr::MFCAPut(void *ls, uint32_t ea, uint32_t size)
{
// get start of cache line
uint32_t eaAligned = SPUMGR_ALIGN_DOWN(ea, 0x80);
// get offset to given ea
uint32_t eaOffset = ea - eaAligned;
// check size to write
DEBUG_ASSERT(size + eaOffset <= 0x80);
// atmoic update - read cache line and reserve it, update it,
// conditionally write it back until write succeeds
// if write succeeds then spu_readch(MFC_RdAtomicStat) returns 0
do
{
// read cache line
spu_mfcdma64(&m_MFCACacheLine[0], 0, eaAligned, 128, 0, MFC_GETLLAR_CMD);
// wait for completion - this is a blocking read
spu_readch(MFC_RdAtomicStat);
spu_dsync();
// update cache line
memcpy(&m_MFCACacheLine[eaOffset], ls, size);
// dsync to make sure it's commited to LS
spu_dsync();
// write it back
spu_mfcdma64(&m_MFCACacheLine[0], 0, eaAligned, 128, 0, MFC_PUTLLC_CMD);
} while (__builtin_expect(spu_readch(MFC_RdAtomicStat), 0));
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Polls for completion of the MFC tag groups selected by dmaTagMask.
// Returns 0 when all selected groups are complete; when bBlocking is false
// a single status query is made and non-zero means "not done yet".
//
// Procedure per Cell Broadband Engine Architecture V1.0, ch. 9.3.1
// ("Procedures for Determining the Status of Tag Groups"):
//   1. Clear any pending tag status update request: write 0 to
//      MFC_WrTagUpdate, wait for its channel count to return to 1, then
//      read and discard the stale MFC_RdTagStat value.
//   2. Select the tag groups of interest via MFC_WrTagMask.
//   3. Request an immediate status update (write 0 to MFC_WrTagUpdate).
//   4. Read MFC_RdTagStat; the result is per-group completion status with
//      the mask applied.
//   5. Repeat steps 3-4 until the groups of interest are complete.
inline int SpuMgr::DmaDone(uint32_t dmaTagMask, bool bBlocking /*=true*/)
{
	// Step 1: clear pending tag status update requests.
	spu_writech(MFC_WrTagUpdate, 0);
	while (spu_readchcnt(MFC_WrTagUpdate) == 0)
	{
	}
	spu_readch(MFC_RdTagStat);

	// Step 2: select the tag groups of interest.
	spu_writech(MFC_WrTagMask, dmaTagMask);

	// Steps 3-5: query status, repeating while blocking and not yet done.
	uint32_t bDone = 0;
	do
	{
		spu_writech(MFC_WrTagUpdate, 0);
		uint32_t tagGroupStat = spu_readch(MFC_RdTagStat);
		bDone = (tagGroupStat == dmaTagMask);
	} while (bBlocking && !bDone);

	return !bDone;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Writes 'val' to the outbound (SPU -> PPU) mailbox. When bBlocking, spins
// until a slot is free; otherwise tries once. Returns 0 on success,
// non-zero when no slot was available.
inline int SpuMgr::WriteMailbox(uint32_t val, bool bBlocking /* = true */)
{
	uint32_t nFreeSlots;
	for (;;)
	{
		nFreeSlots = spu_readchcnt(SPU_WrOutMbox);
		if (nFreeSlots || !bBlocking)
			break;
	}
	if (nFreeSlots)
	{
		spu_writech(SPU_WrOutMbox, val);
	}
	return !nFreeSlots;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Writes 'val' to the outbound interrupt mailbox (raises a PPU interrupt).
// When bBlocking, spins until a slot is free; otherwise tries once.
// Returns 0 on success, non-zero when no slot was available.
inline int SpuMgr::WriteIntrMailbox(uint32_t val, bool bBlocking /* = true */)
{
	uint32_t nFreeSlots;
	for (;;)
	{
		nFreeSlots = spu_readchcnt(SPU_WrOutIntrMbox);
		if (nFreeSlots || !bBlocking)
			break;
	}
	if (nFreeSlots)
	{
		spu_writech(SPU_WrOutIntrMbox, val);
	}
	return !nFreeSlots;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Reads one value from the inbound (PPU -> SPU) mailbox into *pVal. When
// bBlocking, spins until mail arrives; otherwise tries once. Returns 0 on
// success, non-zero when nothing was available (*pVal untouched).
inline int SpuMgr::ReadMailbox(uint32_t *pVal, bool bBlocking /* = true */)
{
	uint32_t nPending;
	for (;;)
	{
		nPending = spu_readchcnt(SPU_RdInMbox);
		if (nPending || !bBlocking)
			break;
	}
	if (nPending)
	{
		*pVal = spu_readch(SPU_RdInMbox);
	}
	return !nPending;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Returns the current SPU decrementer value (counts down; used for timestamps).
inline uint32_t SpuMgr::ReadDecr(void)
{
return spu_readch(SPU_RdDec);
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Try-lock on the PPU-shared lock word at m_lockEA: atomic CAS 0 -> 1.
// Returns true only when this caller acquired the lock (previous value was 0).
inline bool SpuMgr::Lock()
{
return cellAtomicCompareAndSwap32( m_lock, m_lockEA, 0, 1 ) == 0;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Releases the lock taken by Lock(): atomic CAS 1 -> 0 at m_lockEA.
// No-op if the word is not currently 1.
inline void SpuMgr::Unlock()
{
cellAtomicCompareAndSwap32( m_lock, m_lockEA, 1, 0 );
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Try-lock on the memcpy lock word at m_memcpyLockEA: atomic CAS 0 -> 1.
// Returns true only when this caller acquired the lock.
inline bool SpuMgr::MemcpyLock()
{
return cellAtomicCompareAndSwap32( m_memcpyLock, m_memcpyLockEA, 0, 1 ) == 0;
}
//--------------------------------------------------------------------------------------------------
//
//--------------------------------------------------------------------------------------------------
// Releases the memcpy lock: atomic CAS 1 -> 0 at m_memcpyLockEA.
inline void SpuMgr::MemcpyUnlock()
{
cellAtomicCompareAndSwap32( m_memcpyLock, m_memcpyLockEA, 1, 0 );
}
//--------------------------------------------------------------------------------------------------
// Externs
//--------------------------------------------------------------------------------------------------
extern SpuMgr gSpuMgr;
#endif // INCLUDED_SPUMGR_SPU_H

View File

@@ -0,0 +1,32 @@
//================ Copyright (c) 1996-2009 Valve Corporation. All Rights Reserved. =================
#include "cgutils.h"
#include "tier0/dbg.h"
// Maps a Cg datatype enum value to its parameter class.
struct DatatypeRec_t
{
CGtype type;
CGparameterclass parameterClass;
};
// Lookup table generated from Cg's master datatype list; entry order follows
// the CGtype enum, with index 0 corresponding to CG_TYPE_START_ENUM + 1.
static DatatypeRec_t s_datatypeClassname[] = {
#define CG_DATATYPE_MACRO(name, compiler_name, enum_name, base_enum, nrows, ncols,classname) \
{ enum_name, classname },
#include <Cg/cg_datatypes.h>
#undef CG_DATATYPE_MACRO
};
// Returns the parameter class for a Cg datatype, or
// CG_PARAMETERCLASS_UNKNOWN when 'type' falls outside the contiguous enum
// range covered by s_datatypeClassname.
CGparameterclass vcgGetTypeClass( CGtype type )
{
	const int nTableEntries = sizeof( s_datatypeClassname ) / sizeof( s_datatypeClassname[0] );
	if ( type > CG_TYPE_START_ENUM && type <= CG_TYPE_START_ENUM + nTableEntries )
	{
		// Table index 0 corresponds to CG_TYPE_START_ENUM + 1.
		DatatypeRec_t &rec = s_datatypeClassname[type - CG_TYPE_START_ENUM - 1];
		Assert( rec.type == type );
		return rec.parameterClass;
	}
	return CG_PARAMETERCLASS_UNKNOWN;
}

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,565 @@
//================ Copyright (c) 1996-2009 Valve Corporation. All Rights Reserved. =================
//
// LibGcm implementation of DX
//
//==================================================================================================
#ifndef DXABSTRACT_H
#define DXABSTRACT_H
#include "tier0/platform.h"
#include "tier0/memalloc.h"
#include "utlvector.h"
#include <cell/gcm.h>
#include <cell/gcm/gcm_method_data.h>
#include <cell/gcm/gcm_methods.h>
#include <sysutil/sysutil_sysparam.h>
#include "gcmconfig.h"
#include "dxabstract_def.h"
#include "gcmtexture.h"
#include "gcmlabels.h"
#define GCM_ALLOW_TIMESTAMPS 1
#ifdef _CERT
#define Debugger() ((void)0)
#else
#define Debugger() DebuggerBreak()
#endif
#define PS3GCM_ARTIFICIAL_TEXTURE_HANDLE_INDEX_BACKBUFFER 0
#define PS3GCM_ARTIFICIAL_TEXTURE_HANDLE_INDEX_DEPTHBUFFER 1
//--------------------------------------------------------------------------------------------------
// Interfaces
//--------------------------------------------------------------------------------------------------
// Base class for all D3D9-style resources in the libGcm abstraction.
struct IDirect3DResource9 : public IUnknown
{
IDirect3DDevice9 *m_device; // parent device
D3DRESOURCETYPE m_restype;  // concrete resource type tag
DWORD SetPriority(DWORD PriorityNew);
};
// for the moment, a "D3D surface" is modeled as a GLM tex, a face, and a mip.
struct IDirect3DSurface9 : public IDirect3DResource9
{
// no Create method, these are filled in by the various create surface methods.
HRESULT LockRect(D3DLOCKED_RECT* pLockedRect,CONST RECT* pRect,DWORD Flags);
HRESULT UnlockRect();
HRESULT GetDesc(D3DSURFACE_DESC *pDesc);
// only invoke this on depth/stencil surfaces please...
// axed HRESULT ResetDepthStencilSurfaceSize( int Width, int Height );
D3DSURFACE_DESC m_desc; // Layout must be the same as IDirect3DBaseTexture9!
CPs3gcmTexture *m_tex;  // backing texture; released in dtor only when owned
int m_face;             // cube face (or 0) this surface views
int m_mip;              // mip level this surface views
bool m_bOwnsTexture;    // true when this surface holds the texture reference
~IDirect3DSurface9() { if ( m_bOwnsTexture && m_tex ) m_tex->Release(); }
};
struct IDirect3DBaseTexture9 : public IDirect3DResource9 // "A Texture.."
{
D3DSURFACE_DESC m_descZero; // desc of top level.
CPs3gcmTexture *m_tex; // this object owns data
D3DRESOURCETYPE GetType();
DWORD GetLevelCount();
HRESULT GetLevelDesc(UINT Level,D3DSURFACE_DESC *pDesc);
// Owns m_tex: releases it on destruction.
~IDirect3DBaseTexture9() { if ( m_tex ) m_tex->Release(); }
};
struct IDirect3DTexture9 : public IDirect3DBaseTexture9 // "Texture 2D"
{
//CUtlVector< IDirect3DSurface9* > m_surfs;
IDirect3DSurface9 *m_surfZero; // surf of top level. YUK!!
HRESULT LockRect(UINT Level,D3DLOCKED_RECT* pLockedRect,CONST RECT* pRect,DWORD Flags);
HRESULT UnlockRect(UINT Level);
HRESULT GetSurfaceLevel(UINT Level,IDirect3DSurface9** ppSurfaceLevel);
// Releases the cached top-level surface wrapper.
~IDirect3DTexture9() { if ( m_surfZero ) m_surfZero->Release(); }
};
struct IDirect3DCubeTexture9 : public IDirect3DBaseTexture9 // "Texture Cube Map"
{
IDirect3DSurface9 *m_surfZero[6]; // surfs of top level, one per cube face. YUK!!
HRESULT GetCubeMapSurface(D3DCUBEMAP_FACES FaceType,UINT Level,IDirect3DSurface9** ppCubeMapSurface);
HRESULT GetLevelDesc(UINT Level,D3DSURFACE_DESC *pDesc);
// Releases all six cached face surfaces.
~IDirect3DCubeTexture9() { for ( int j = 0; j < 6; ++ j ) if ( m_surfZero[j] ) m_surfZero[j]->Release(); }
};
struct IDirect3DVolumeTexture9 : public IDirect3DBaseTexture9 // "Texture 3D"
{
IDirect3DSurface9 *m_surfZero; // surf of top level. YUK!!
D3DVOLUME_DESC m_volDescZero; // volume desc top level
HRESULT LockBox(UINT Level,D3DLOCKED_BOX* pLockedVolume,CONST D3DBOX* pBox,DWORD Flags);
HRESULT UnlockBox(UINT Level);
HRESULT GetLevelDesc( UINT level, D3DVOLUME_DESC *pDesc );
// Releases the cached top-level surface wrapper.
~IDirect3DVolumeTexture9() { if ( m_surfZero ) m_surfZero->Release(); }
};
// Top-level D3D9 factory/caps object for the single PS3 "adapter".
struct IDirect3D9 : public IUnknown
{
public:
UINT GetAdapterCount(); //cheese: returns 1
HRESULT GetDeviceCaps (UINT Adapter,D3DDEVTYPE DeviceType,D3DCAPS9* pCaps);
HRESULT GetAdapterIdentifier (UINT Adapter,DWORD Flags,D3DADAPTER_IDENTIFIER9* pIdentifier);
HRESULT CheckDeviceFormat (UINT Adapter,D3DDEVTYPE DeviceType,D3DFORMAT AdapterFormat,DWORD Usage,D3DRESOURCETYPE RType,D3DFORMAT CheckFormat);
UINT GetAdapterModeCount (UINT Adapter,D3DFORMAT Format);
HRESULT EnumAdapterModes (UINT Adapter,D3DFORMAT Format,UINT Mode,D3DDISPLAYMODE* pMode);
HRESULT CheckDeviceType (UINT Adapter,D3DDEVTYPE DevType,D3DFORMAT AdapterFormat,D3DFORMAT BackBufferFormat,BOOL bWindowed);
HRESULT GetAdapterDisplayMode (UINT Adapter,D3DDISPLAYMODE* pMode);
HRESULT CheckDepthStencilMatch (UINT Adapter,D3DDEVTYPE DeviceType,D3DFORMAT AdapterFormat,D3DFORMAT RenderTargetFormat,D3DFORMAT DepthStencilFormat);
HRESULT CheckDeviceMultiSampleType (UINT Adapter,D3DDEVTYPE DeviceType,D3DFORMAT SurfaceFormat,BOOL Windowed,D3DMULTISAMPLE_TYPE MultiSampleType,DWORD* pQualityLevels);
HRESULT CreateDevice (UINT Adapter,D3DDEVTYPE DeviceType,VD3DHWND hFocusWindow,DWORD BehaviorFlags,D3DPRESENT_PARAMETERS* pPresentationParameters,IDirect3DDevice9** ppReturnedDeviceInterface);
};
// Placeholder: swap chains are not used on this platform; the type exists only
// so code written against the D3D9 API continues to compile.
struct IDirect3DSwapChain9 : public IUnknown
{
};
// GPU query object. Occlusion queries map to GCM reports, event (fence)
// queries map to GCM labels; each kind draws its slot from a shared static
// pool (s_GlobalStateOcclusion / s_GlobalStateFence).
struct IDirect3DQuery9 : public IUnknown
{
public:
D3DQUERYTYPE m_type; // D3DQUERYTYPE_OCCLUSION or D3DQUERYTYPE_EVENT
uint32 m_queryIdx;
enum Flags_t
{
kQueryValueMask = 0x0000FFFF, // Mask for query value index
kQueryFinished = 0x80000000, // Query is completed
kQueryUninitialized = 0xFFFFFFFF, // Query hasn't started
};
// Shared pool of GCM report slots used by occlusion queries.
struct QueryGlobalStateOcclusion_t
{
enum { kMaxQueries = GCM_REPORT_QUERY_LAST + 1 - GCM_REPORT_QUERY_FIRST, kGcmQueryBase = GCM_REPORT_QUERY_FIRST };
CellGcmReportData volatile *m_Values[kMaxQueries];
uint32 m_queryIdx;
uint32 PrepareForQuery();
};
static QueryGlobalStateOcclusion_t s_GlobalStateOcclusion;
// Shared pool of GCM label slots used by event/fence queries.
struct QueryGlobalStateFence_t
{
enum { kMaxQueries = GCM_LABEL_QUERY_LAST + 1 - GCM_LABEL_QUERY_FIRST, kGcmLabelBase = GCM_LABEL_QUERY_FIRST };
uint32 volatile *m_Values[kMaxQueries];
uint32 m_queryIdx;
uint32 PrepareForQuery();
};
static QueryGlobalStateFence_t s_GlobalStateFence;
HRESULT Issue(DWORD dwIssueFlags);
HRESULT GetData(void* pData,DWORD dwSize,DWORD dwGetDataFlags);
};
// Common base for vertex/index buffers: owns a refcounted CPs3gcmBuffer and
// provides the D3D-style Lock/Unlock surface over it.
struct IDirect3DGcmBufferBase : public IUnknown
{
public:
CPs3gcmBuffer *m_pBuffer;
HRESULT Lock(UINT OffsetToLock,UINT SizeToLock,void** ppbData,DWORD Flags);
HRESULT Unlock();
// Drops our reference on the underlying GCM buffer.
~IDirect3DGcmBufferBase() { if ( m_pBuffer ) m_pBuffer->Release(); }
};
// Vertex buffer: GCM buffer plus the descriptor D3D callers expect from GetDesc.
struct IDirect3DVertexBuffer9 : public IDirect3DGcmBufferBase
{
public:
D3DVERTEXBUFFER_DESC m_vtxDesc; // to satisfy GetDesc
};
// Index buffer: GCM buffer plus the descriptor D3D callers expect from GetDesc.
struct IDirect3DIndexBuffer9 : public IDirect3DGcmBufferBase
{
public:
D3DINDEXBUFFER_DESC m_idxDesc; // to satisfy GetDesc
HRESULT GetDesc(D3DINDEXBUFFER_DESC *pDesc);
};
// Common base for shader objects: owns a malloc'ed CgBinaryProgram blob and
// exposes accessors for the Cg handle and the embedded microcode.
struct IDirect3DGcmProgramBase : public IUnknown
{
public:
CgBinaryProgram *m_pProgram;
inline CGprogram GetCgProgram() const { return reinterpret_cast< CGprogram >( m_pProgram ); }
// ucode is stored as a byte offset from the start of the binary blob
inline void * GetProgramUCode() const { return (((char*)m_pProgram) + m_pProgram->ucode); }
// Blob was allocated with malloc, so free() (not delete) is correct here.
~IDirect3DGcmProgramBase() { if ( m_pProgram ) free( m_pProgram ); }
};
// define this to find out how many times we reuse the same shader during a frame
//#define DEBUG_GCM_VERTEX_SHADER_USAGE
// Vertex shader object: Cg binary blob (base class) plus platform runtime data.
struct IDirect3DVertexShader9 : public IDirect3DGcmProgramBase
{
public:
VertexShader9Data_t m_data;
//uint32 m_nIoOffsetStart; // the start of subroutine (IO Offset on RSX) that sets this vertex program
~IDirect3DVertexShader9();
};
// Pixel (fragment) shader object. 16-byte aligned because m_data holds data
// consumed by SPU patch jobs. Note: does NOT derive from IDirect3DGcmProgramBase;
// the Cg binary is converted at creation time (see IDirect3DPixelShader9 ctor).
struct IDirect3DPixelShader9 : public CAlignedNewDelete< 16, IUnknown >
{
public:
PixelShader9Data_t m_data;
public:
//inline CgBinaryFragmentProgram *GetFragmentProgram() const { return (CgBinaryFragmentProgram *)(((char*)m_pProgram) + m_pProgram->program); }
//void ValidateAssumptions( const char * pShaderName );
IDirect3DPixelShader9( CgBinaryProgram* prog );
~IDirect3DPixelShader9();
#ifdef _DEBUG
// original Cg binary retained in debug builds for patch validation
CgBinaryProgram *m_pCgProg;
#endif
};
// Matrix stack emulating D3DX's ID3DXMatrixStack for fixed-function-style code.
// Backed by a growable CUtlVector; the top of the stack is the highest index.
struct ID3DXMatrixStack : public IUnknown
{
public:
CUtlVector<D3DMATRIX> m_stack;
int m_stackTop; // top of stack is at the highest index, this is that index. push increases, pop decreases.
HRESULT Create( void );
D3DXMATRIX* GetTop();
void Push();
void Pop();
void LoadIdentity();
void LoadMatrix( const D3DXMATRIX *pMat );
void MultMatrix( const D3DXMATRIX *pMat );
void MultMatrixLocal( const D3DXMATRIX *pMat );
HRESULT ScaleLocal(FLOAT x, FLOAT y, FLOAT z);
// Left multiply the current matrix with the computed rotation
// matrix, counterclockwise about the given axis with the given angle.
// (rotation is about the local origin of the object)
HRESULT RotateAxisLocal(CONST D3DXVECTOR3* pV, FLOAT Angle);
// Left multiply the current matrix with the computed translation
// matrix. (transformation is about the local origin of the object)
HRESULT TranslateLocal(FLOAT x, FLOAT y, FLOAT z);
};
typedef ID3DXMatrixStack* LPD3DXMATRIXSTACK;
// Snapshot of the arguments passed to IDirect3D9::CreateDevice, kept so the
// device can be (re)initialized from the same inputs later.
struct IDirect3DDevice9Params
{
UINT m_adapter;
D3DDEVTYPE m_deviceType;
VD3DHWND m_focusWindow;
DWORD m_behaviorFlags;
D3DPRESENT_PARAMETERS m_presentationParameters;
};
// Current index-buffer binding, recorded by IDirect3DDevice9::SetIndices.
struct D3DIndexDesc
{
IDirect3DIndexBuffer9 *m_idxBuffer;
};
// The device: PS3/GCM implementation of the D3D9 device interface. Tracks the
// currently bound render targets, shaders, vertex declaration and index buffer,
// and translates D3D-style calls into RSX command-buffer traffic.
struct IDirect3DDevice9 : public IUnknown
{
// members
IDirect3DDevice9Params m_params; // mirror of the creation inputs
// D3D flavor stuff
IDirect3DSurface9 *m_rtSurfaces[16]; // current color RT (Render Target) surfaces. [0] is initially == m_defaultColorSurface
IDirect3DSurface9 *m_dsSurface; // current Depth Stencil Render Target surface. can be changed!
IDirect3DSurface9 *m_defaultColorSurface; // default color surface.
IDirect3DSurface9 *m_defaultDepthStencilSurface; // queried by GetDepthStencilSurface.
IDirect3DVertexDeclaration9 *m_vertDecl; // Set by SetVertexDeclaration...
//D3DStreamDesc *m_pVertexStreamSources; // Set by SetStreamSource..
D3DIndexDesc m_indices; // Set by SetIndices..
IDirect3DVertexShader9 *m_vertexShader; // Set by SetVertexShader...
IDirect3DPixelShader9 *m_pixelShader; // Set by SetPixelShader...
#ifdef _DEBUG
uint m_nDrawIndexedPrimitives; // debug draw-call counter
#endif
enum AntiAliasingStatusEnum_t
{
AA_STATUS_NORMAL,
AA_STATUS_PREV_FRAME, // drawing into previous frame, aliased
AA_STATUS_DEFERRED // drawing into deferred queue
};
// this is used to draw UI into already-mlaa'd-surface (to avoid AA'ing the UI)
// when this is on, the default surface to draw should be previous flip surface
AntiAliasingStatusEnum_t m_nAntiAliasingStatus;
// is in logical zpass? logical zpass may have wider scope than spuGcm.zPass, because logical zpass does not abort for any reason. It begins and ends with API calls. Used to balance Perf Marker Push/Pop
bool m_isZPass; //
bool m_isDeferredDrawQueueSurfaceSet;
// methods
// Create call invoked from IDirect3D9
HRESULT Create( IDirect3DDevice9Params *params );
//
// Basics
//
HRESULT Reset(D3DPRESENT_PARAMETERS* pPresentationParameters);
HRESULT SetViewport(CONST D3DVIEWPORT9* pViewport);
HRESULT BeginScene();
HRESULT Clear(DWORD Count,CONST D3DRECT* pRects,DWORD Flags,D3DCOLOR Color,float Z,DWORD Stencil);
HRESULT EndScene();
HRESULT Present(CONST RECT* pSourceRect,CONST RECT* pDestRect,VD3DHWND hDestWindowOverride,CONST RGNDATA* pDirtyRegion);
// textures
HRESULT CreateTexture(UINT Width,UINT Height,UINT Levels,DWORD Usage,D3DFORMAT Format,D3DPOOL Pool,IDirect3DTexture9** ppTexture,VD3DHANDLE* pSharedHandle);
HRESULT CreateCubeTexture(UINT EdgeLength,UINT Levels,DWORD Usage,D3DFORMAT Format,D3DPOOL Pool,IDirect3DCubeTexture9** ppCubeTexture,VD3DHANDLE* pSharedHandle);
HRESULT CreateVolumeTexture(UINT Width,UINT Height,UINT Depth,UINT Levels,DWORD Usage,D3DFORMAT Format,D3DPOOL Pool,IDirect3DVolumeTexture9** ppVolumeTexture,VD3DHANDLE* pSharedHandle);
HRESULT SetTexture(DWORD Stage,IDirect3DBaseTexture9* pTexture);
HRESULT GetTexture(DWORD Stage,IDirect3DBaseTexture9** ppTexture);
// render targets, color and depthstencil, surfaces, blit
HRESULT CreateRenderTarget(UINT Width,UINT Height,D3DFORMAT Format,D3DMULTISAMPLE_TYPE MultiSample,DWORD MultisampleQuality,BOOL Lockable,IDirect3DSurface9** ppSurface,VD3DHANDLE* pSharedHandle);
HRESULT SetRenderTarget(DWORD RenderTargetIndex,IDirect3DSurface9* pRenderTarget);
HRESULT GetRenderTarget(DWORD RenderTargetIndex,IDirect3DSurface9** ppRenderTarget);
HRESULT CreateOffscreenPlainSurface(UINT Width,UINT Height,D3DFORMAT Format,D3DPOOL Pool,IDirect3DSurface9** ppSurface,VD3DHANDLE* pSharedHandle);
HRESULT CreateDepthStencilSurface(UINT Width,UINT Height,D3DFORMAT Format,D3DMULTISAMPLE_TYPE MultiSample,DWORD MultisampleQuality,BOOL Discard,IDirect3DSurface9** ppSurface,VD3DHANDLE* pSharedHandle);
HRESULT SetDepthStencilSurface(IDirect3DSurface9* pNewZStencil);
HRESULT GetDepthStencilSurface(IDirect3DSurface9** ppZStencilSurface);
HRESULT GetRenderTargetData(IDirect3DSurface9* pRenderTarget,IDirect3DSurface9* pDestSurface); // ? is anyone using this ?
HRESULT GetFrontBufferData(UINT iSwapChain,IDirect3DSurface9* pDestSurface);
HRESULT StretchRect(IDirect3DSurface9* pSourceSurface,CONST RECT* pSourceRect,IDirect3DSurface9* pDestSurface,CONST RECT* pDestRect,D3DTEXTUREFILTERTYPE Filter);
// pixel shaders
HRESULT CreatePixelShader(CONST DWORD* pFunction,IDirect3DPixelShader9** ppShader, const char *pShaderName = NULL, char *debugLabel = NULL);
HRESULT SetPixelShader(IDirect3DPixelShader9* pShader);
HRESULT SetPixelShaderConstantF(UINT StartRegister,CONST float* pConstantData,UINT Vector4fCount);
HRESULT SetPixelShaderConstantB(UINT StartRegister,CONST BOOL* pConstantData,UINT BoolCount);
HRESULT SetPixelShaderConstantI(UINT StartRegister,CONST int* pConstantData,UINT Vector4iCount);
// vertex shaders
HRESULT CreateVertexShader(CONST DWORD* pFunction,IDirect3DVertexShader9** ppShader, char *debugLabel = NULL);
HRESULT SetVertexShader(IDirect3DVertexShader9* pShader);
HRESULT SetVertexShaderConstantF(UINT StartRegister,CONST float* pConstantData,UINT Vector4fCount);
HRESULT SetVertexShaderConstantB(UINT StartRegister,CONST BOOL* pConstantData,UINT BoolCount);
HRESULT SetVertexShaderConstantI(UINT StartRegister,CONST int* pConstantData,UINT Vector4iCount);
// vertex buffers
HRESULT CreateVertexDeclaration(CONST D3DVERTEXELEMENT9* pVertexElements,IDirect3DVertexDeclaration9** ppDecl);
HRESULT SetVertexDeclaration(IDirect3DVertexDeclaration9* pDecl);
HRESULT SetFVF(DWORD FVF); // we might not be using these ?
HRESULT GetFVF(DWORD* pFVF);
HRESULT CreateVertexBuffer(UINT Length,DWORD Usage,DWORD FVF,D3DPOOL Pool,IDirect3DVertexBuffer9** ppVertexBuffer,VD3DHANDLE* pSharedHandle);
HRESULT SetStreamSource(UINT StreamNumber,IDirect3DVertexBuffer9* pStreamData,UINT OffsetInBytes,UINT Stride);
HRESULT SetRawHardwareDataStreams( IDirect3DVertexBuffer9** ppRawHardwareDataStreams );
// index buffers
HRESULT CreateIndexBuffer(UINT Length,DWORD Usage,D3DFORMAT Format,D3DPOOL Pool,IDirect3DIndexBuffer9** ppIndexBuffer,VD3DHANDLE* pSharedHandle);
HRESULT SetIndices(IDirect3DIndexBuffer9* pIndexData);
// State management.
HRESULT SetRenderState(D3DRENDERSTATETYPE State,DWORD Value);
HRESULT SetSamplerState(DWORD Sampler,D3DSAMPLERSTATETYPE Type,DWORD Value);
// Draw.
HRESULT ValidateDrawPrimitiveStreams( D3DPRIMITIVETYPE Type, UINT baseVertexIndex, UINT MinVertexIndex, UINT NumVertices, UINT startIndex, UINT primCount ); // validate streams
HRESULT DrawPrimitive(D3DPRIMITIVETYPE PrimitiveType,UINT StartVertex,UINT PrimitiveCount);
void DrawPrimitiveUP(D3DPRIMITIVETYPE PrimitiveType,UINT PrimitiveCount,CONST void *pVertexStreamZeroData,UINT VertexStreamZeroStride);
HRESULT DrawIndexedPrimitive(D3DPRIMITIVETYPE PrimitiveType,INT BaseVertexIndex,UINT MinVertexIndex,UINT NumVertices,UINT startIndex,UINT primCount);
HRESULT DrawIndexedPrimitiveUP(D3DPRIMITIVETYPE PrimitiveType,UINT MinVertexIndex,UINT NumVertices,UINT PrimitiveCount,CONST void* pIndexData,D3DFORMAT IndexDataFormat,CONST void* pVertexStreamZeroData,UINT VertexStreamZeroStride);
// misc
BOOL ShowCursor(BOOL bShow);
HRESULT ValidateDevice(DWORD* pNumPasses);
HRESULT SetMaterial(CONST D3DMATERIAL9* pMaterial);
HRESULT LightEnable(DWORD Index,BOOL Enable);
HRESULT SetScissorRect(CONST RECT* pRect);
HRESULT CreateQuery(D3DQUERYTYPE Type,IDirect3DQuery9** ppQuery);
HRESULT GetDeviceCaps(D3DCAPS9* pCaps);
HRESULT TestCooperativeLevel();
HRESULT EvictManagedResources();
HRESULT SetLight(DWORD Index,CONST D3DLIGHT9*);
void SetGammaRamp(UINT iSwapChain,DWORD Flags,CONST D3DGAMMARAMP* pRamp);
// Talk to JasonM about this one. It's tricky in GL.
HRESULT SetClipPlane(DWORD Index,CONST float* pPlane);
ULONG __stdcall Release();
// Xbox ZPass analogue
void BeginZPass( DWORD Flags );
void SetPredication( DWORD PredicationMask );
HRESULT EndZPass();
// void ReloadZcullMemory( int nStencilRef );
void StartRenderingIntoPreviousFramebuffer();
void AntiAliasingHint( int nHint );
//
//
// **** FIXED FUNCTION STUFF - None of this stuff needs support in GL.
//
//
HRESULT SetTransform(D3DTRANSFORMSTATETYPE State,CONST D3DMATRIX* pMatrix);
HRESULT SetTextureStageState(DWORD Stage,D3DTEXTURESTAGESTATETYPE Type,DWORD Value);
#ifdef _PS3
void GetGPUMemoryStats( GPUMemoryStats &stats ) { return ::GetGPUMemoryStats( stats ); }
void FlushVertexCache();
void FlushTextureCache();
// Allocate storage for a texture's bits (if D3DUSAGE_TEXTURE_NOD3DMEMORY was used to defer allocation on creation)
bool AllocateTextureStorage( IDirect3DBaseTexture9 *pTexture );
protected:
// Flushing changes to GL
void SetVertexStreamSource( uint i, IDirect3DVertexBuffer9* pStreamData,UINT OffsetInBytes,UINT Stride );
void Ps3Helper_ResetSurfaceToKnownDefaultState();
void Ps3Helper_UpdateSurface( int idx );
friend void DxDeviceForceUpdateRenderTarget( );
#endif
};
// Abstract include-handler callback for shader compilation, mirroring D3DX's
// ID3DXInclude contract (Open supplies the file contents, Close releases them).
struct ID3DXInclude
{
virtual HRESULT Open(D3DXINCLUDE_TYPE IncludeType, LPCSTR pFileName, LPCVOID pParentData, LPCVOID *ppData, UINT *pBytes) = 0;
virtual HRESULT Close(LPCVOID pData) = 0;
};
typedef ID3DXInclude* LPD3DXINCLUDE;
// Opaque refcounted byte buffer, mirroring D3DX's ID3DXBuffer (e.g. for
// compiled shader bytecode and compiler error strings).
struct ID3DXBuffer : public IUnknown
{
void* GetBufferPointer();
DWORD GetBufferSize();
};
typedef ID3DXBuffer* LPD3DXBUFFER;
// Stub: constant-table reflection is not implemented on this platform; the
// type only exists to satisfy D3DXCompileShader's signature.
class ID3DXConstantTable : public IUnknown
{
};
typedef ID3DXConstantTable* LPD3DXCONSTANTTABLE;
// ------------------------------------------------------------------------------------------------------------------------------ //
// D3DX stuff.
// ------------------------------------------------------------------------------------------------------------------------------ //
// Free-function subset of the D3DX math/utility API implemented by this layer.
const char* D3DXGetPixelShaderProfile( IDirect3DDevice9 *pDevice );
D3DXMATRIX* D3DXMatrixMultiply( D3DXMATRIX *pOut, CONST D3DXMATRIX *pM1, CONST D3DXMATRIX *pM2 );
D3DXVECTOR3* D3DXVec3TransformCoord( D3DXVECTOR3 *pOut, CONST D3DXVECTOR3 *pV, CONST D3DXMATRIX *pM );
HRESULT D3DXCreateMatrixStack( DWORD Flags, LPD3DXMATRIXSTACK* ppStack);
void D3DXMatrixIdentity( D3DXMATRIX * );
// Component-wise vector difference: *pOut = *pV1 - *pV2. Returns pOut so the
// call can be chained, matching the D3DX convention. Safe if pOut aliases an input.
D3DXINLINE D3DXVECTOR3* D3DXVec3Subtract( D3DXVECTOR3 *pOut, CONST D3DXVECTOR3 *pV1, CONST D3DXVECTOR3 *pV2 )
{
FLOAT dx = pV1->x - pV2->x;
FLOAT dy = pV1->y - pV2->y;
FLOAT dz = pV1->z - pV2->z;
pOut->x = dx;
pOut->y = dy;
pOut->z = dz;
return pOut;
}
// Right-handed cross product: *pOut = *pV1 x *pV2. All three components are
// computed into locals before any store, so pOut may alias either input.
D3DXINLINE D3DXVECTOR3* D3DXVec3Cross( D3DXVECTOR3 *pOut, CONST D3DXVECTOR3 *pV1, CONST D3DXVECTOR3 *pV2 )
{
FLOAT cx = pV1->y * pV2->z - pV1->z * pV2->y;
FLOAT cy = pV1->z * pV2->x - pV1->x * pV2->z;
FLOAT cz = pV1->x * pV2->y - pV1->y * pV2->x;
pOut->x = cx;
pOut->y = cy;
pOut->z = cz;
return pOut;
}
// Scalar dot product of two 3-vectors (x1*x2 + y1*y2 + z1*z2, left-to-right).
D3DXINLINE FLOAT D3DXVec3Dot( CONST D3DXVECTOR3 *pV1, CONST D3DXVECTOR3 *pV2 )
{
FLOAT flSum = pV1->x * pV2->x;
flSum += pV1->y * pV2->y;
flSum += pV1->z * pV2->z;
return flSum;
}
// Remaining D3DX math prototypes, the factory entry point, shader compilation,
// and fake usage flags specific to this PS3 layer.
D3DXMATRIX* D3DXMatrixInverse( D3DXMATRIX *pOut, FLOAT *pDeterminant, CONST D3DXMATRIX *pM );
D3DXMATRIX* D3DXMatrixTranspose( D3DXMATRIX *pOut, CONST D3DXMATRIX *pM );
D3DXPLANE* D3DXPlaneNormalize( D3DXPLANE *pOut, CONST D3DXPLANE *pP);
D3DXVECTOR4* D3DXVec4Transform( D3DXVECTOR4 *pOut, CONST D3DXVECTOR4 *pV, CONST D3DXMATRIX *pM );
D3DXVECTOR4* D3DXVec4Normalize( D3DXVECTOR4 *pOut, CONST D3DXVECTOR4 *pV );
D3DXMATRIX* D3DXMatrixTranslation( D3DXMATRIX *pOut, FLOAT x, FLOAT y, FLOAT z );
// Build an ortho projection matrix. (right-handed)
D3DXMATRIX* D3DXMatrixOrthoOffCenterRH( D3DXMATRIX *pOut, FLOAT l, FLOAT r, FLOAT b, FLOAT t, FLOAT zn,FLOAT zf );
D3DXMATRIX* D3DXMatrixPerspectiveRH( D3DXMATRIX *pOut, FLOAT w, FLOAT h, FLOAT zn, FLOAT zf );
D3DXMATRIX* D3DXMatrixPerspectiveOffCenterRH( D3DXMATRIX *pOut, FLOAT l, FLOAT r, FLOAT b, FLOAT t, FLOAT zn, FLOAT zf );
// Transform a plane by a matrix. The vector (a,b,c) must be normal.
// M should be the inverse transpose of the transformation desired.
D3DXPLANE* D3DXPlaneTransform( D3DXPLANE *pOut, CONST D3DXPLANE *pP, CONST D3DXMATRIX *pM );
IDirect3D9 *Direct3DCreate9(UINT SDKVersion);
void D3DPERF_SetOptions( DWORD dwOptions );
HRESULT D3DXCompileShader(
LPCSTR pSrcData,
UINT SrcDataLen,
CONST D3DXMACRO* pDefines,
LPD3DXINCLUDE pInclude,
LPCSTR pFunctionName,
LPCSTR pProfile,
DWORD Flags,
LPD3DXBUFFER* ppShader,
LPD3DXBUFFER* ppErrorMsgs,
LPD3DXCONSTANTTABLE* ppConstantTable);
// fake D3D usage constant for SRGB tex creation
#define D3DUSAGE_TEXTURE_SRGB (0x80000000L)
// fake D3D usage constant for deferred tex bits allocation
#define D3DUSAGE_TEXTURE_NOD3DMEMORY (0x40000000L)
// Set by -dxmicroprofile; enables the BindProgram/FpcPatch2 micro-profilers.
extern bool g_bDxMicroProfile;
#endif // DXABSTRACT_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,611 @@
//========= Copyright (c) Valve Corporation, All rights reserved. ====//
#include "tier0/platform.h"
#ifdef _PS3
#include "dxabstract.h"
#include <sys/memory.h>
#include "ps3/spugcm_shared.h"
#include "fpcpatcher_spu.h"
#include "cg/cg.h"
#include "cg/cgBinary.h"
#include "vjobs_interface.h"
#include "tier0/hardware_clock_fast.h"
#include "vjobs/fpcpatch_shared.h"
#include "vjobs/root.h"
#include "ps3/vjobutils.h"
#include "tier0/microprofiler.h"
#include "ps3/ps3_gcm_config.h"
#include "spugcm.h"
// Cg profile identifiers emitted by the SCE Cg compiler for RSX programs.
enum
{
PROFILE_SCE_VP_RSX = 7003,
PROFILE_SCE_FP_RSX = 7004
};
// Wrap a libgcm call; hard-fail with file/line context on any non-CELL_OK result.
#define GCM_MUST_SUCCEED( FUNC, ... ) do { int nError = FUNC(__VA_ARGS__); if( nError != CELL_OK ) { Error( "Error 0x%X in " #FUNC ", %s:%d\n", nError, __FILE__, __LINE__ ); } } while( 0 )
DEFINE_LOGGING_CHANNEL_NO_TAGS( LOG_VJOBS, "VJOBS" );
// Singleton that patches fragment-program constants into RSX ucode via SPU jobs.
CFragmentProgramConstantPatcher_SPU g_pixelShaderPatcher; // Patches pixel shader constants
// Micro-profiler accumulators for BindProgram()/FpcPatch2(); dumped in EndScene().
CMicroProfiler g_mpBindProgram, g_mpFpcPatch2;
// debug only
// Constructor: null out buffer state; the real setup is deferred to InitLocal().
CFragmentProgramConstantPatcher_SPU::CFragmentProgramConstantPatcher_SPU()
{
m_pBuffer = m_pBufferEnd = NULL;
m_nIoOffsetDelta = 0; // m_pBuffer + m_nIoOffsetDelta == IO offset usable by RSX
m_pPutFragmentProgram = NULL;
#ifdef DEBUG_FPC_PATCHER
// -fpcpsync forces fully synchronous patching (PPU waits on each SPU job)
m_bSync = ( CommandLine()->FindParm( "-fpcpsync" ) != 0 );
#endif
}
// Initialize with a caller-provided buffer (expected to be RSX local memory)
// that receives patched fragment-program ucode, and allocate the shared
// constant-journal state consumed by SPU patch jobs.
// pBuffer/nSize: destination ucode buffer; must be 128-byte aligned and
// contiguously mappable via cellGcmAddressToOffset (asserted below).
void CFragmentProgramConstantPatcher_SPU::InitLocal( void *pBuffer, uint nSize )
{
m_nFpcPatchCounter = 0;
m_nFpcPatchCounterOfLastSyncJob = 0;
//cellGcmSetDebugOutputLevel( CELL_GCM_DEBUG_LEVEL2 );
// NOTE(review): nOverfetchGuard is currently unused here — presumably the
// guard space is accounted for by the caller sizing pBuffer; confirm.
const uint nOverfetchGuard = 1024; // RSX front end prefetches up to 4k, but 1k is ( should be ) enough to avoid overfetch crashes
const uint nStateBufferQwords = 1 << 12; // make space for at least 8 full batches of constants...
uint nPatchStateBufferSize = ( sizeof( job_fpcpatch::FpcPatchState_t ) + sizeof( fltx4 ) * nStateBufferQwords );
uint32 nBufferIoOffset;
m_bFpcPatchOnPpu = ( 0 != CommandLine()->FindParm( "-fpcpatchonppu" ) );
#ifdef DEBUG_FPC_PATCHER
m_bTestAlwaysStateSync = ( 0 != CommandLine()->FindParm( "-fpcpstatesync" ) );
#endif
m_bEnableSPU = true;
m_nFpcPatchSyncMask = 0;
// use this passed buffer (probably from local memory) for the patched stuff
m_pBuffer = ( uint32* ) pBuffer;
m_pBufferEnd = ( uint32* ) ( uintp( pBuffer ) + nSize );
m_nBufferLocation = CELL_GCM_LOCATION_LOCAL;
m_isBufferPassedIn = true;
// shared state header + journal ring, 128-byte aligned for SPU DMA
m_state.Init( ( job_fpcpatch::FpcPatchState_t* )MemAlloc_AllocAligned( nPatchStateBufferSize, 128 ), nStateBufferQwords );
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pBuffer, &nBufferIoOffset );
#ifdef DBGFLAG_ASSERT
// verify the buffer maps to a contiguous, 128-byte-aligned IO range
uint32 nBufferIoOffsetCheck;
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pBuffer, &nBufferIoOffsetCheck );
Assert( nBufferIoOffsetCheck == nBufferIoOffset );
Assert( !( nBufferIoOffsetCheck & 0x7F ) );
for( uint nOffset = 0; nOffset < nSize; nOffset += 128 )
{
GCM_MUST_SUCCEED( cellGcmAddressToOffset, ((uint8*)m_pBuffer) + nOffset, &nBufferIoOffsetCheck );
Assert( nBufferIoOffsetCheck == nBufferIoOffset + nOffset );
}
#endif
m_nIoOffsetDelta = nBufferIoOffset - uintp( m_pBuffer );
#ifdef DEBUG_FPC_PATCHER
// shadow copy of the virtual constant file, pre-filled with 0xCD poison
m_pSyncState = ( fltx4* ) MemAlloc_AllocAligned( sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT, 16 );
V_memset( m_pSyncState, 0xCD, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT );
V_memset( m_state.m_pSharedState->m_reg, 0xCD, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT );
#endif
ResetPut();
//cellGcmSetDebugOutputLevel( CELL_GCM_DEBUG_LEVEL0 );
}
// Intentionally empty: buffers are released in the destructor, not here.
void CFragmentProgramConstantPatcher_SPU::Shutdown()
{
}
// Park the patched-ucode write cursor at the end of the buffer; the next
// BindProgram() will lock a fresh range from the fpcp ring.
void CFragmentProgramConstantPatcher_SPU::ResetPut()
{
m_pPutFragmentProgram = m_pBufferEnd; // reserved word for the count of constants to set
}
// Destructor: release whichever allocations this object owns. With InitLocal()
// (m_isBufferPassedIn == true) we own only the shared-state block, not m_pBuffer.
CFragmentProgramConstantPatcher_SPU::~CFragmentProgramConstantPatcher_SPU()
{
if( m_isBufferPassedIn )
{
MemAlloc_FreeAligned( m_state.m_pSharedState );
}
else
{
// NOTE(review): this path frees m_pBuffer but not m_state.m_pSharedState —
// presumably the non-passed-in init (not visible here) carves shared state
// out of the same sys_memory allocation; verify against that init path.
sys_memory_free( ( sys_addr_t )m_pBuffer );
}
#ifdef DEBUG_FPC_PATCHER
MemAlloc_FreeAligned( m_pSyncState );
#endif
}
// Frame start: snapshot the patch counter for per-frame stats and sanity-check
// that the journal read/write indices are within one ring length of each other.
void CFragmentProgramConstantPatcher_SPU::BeginScene()
{
m_nFpcPatchCounterAtBeginScene = m_nFpcPatchCounter;
// we shouldn't have in-flight SPU jobs by now.. should we?
Assert( uint( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync - m_state.m_pSharedState->m_nStartRanges ) <= m_state.m_pSharedState->m_nBufferMask + 1 );
}
// Frame end: if micro-profiling is enabled (-dxmicroprofile) and any patches
// happened this frame, dump and reset the BindProgram/FpcPatch2 timers.
void CFragmentProgramConstantPatcher_SPU::EndScene()
{
#if ENABLE_MICRO_PROFILER > 0
uint nPatchCounter = m_nFpcPatchCounter - m_nFpcPatchCounterAtBeginScene;
extern bool g_bDxMicroProfile;
if( g_bDxMicroProfile && nPatchCounter )
{
g_mpBindProgram.PrintAndReset( "[BindProgram] " );
g_mpFpcPatch2 .PrintAndReset( "[FpcPatch2] " );
}
#endif
}
// Empty fragment-program header: used to issue a state-sync-only FpcPatch2 job
// (no ucode to patch, just flush the constant journal to the shared state).
job_fpcpatch2::FpHeader_t g_nullFpHeader = {0,0,0,0};
// semantics should match cgGLSetFragmentRegisterBlock()
// Stage a block of fragment-program constants (semantics match
// cgGLSetFragmentRegisterBlock()). The constants are appended to the shared
// journal ring that SPU patch jobs (FpcPatch2) later fold into shader ucode;
// may spin-wait (with a stall warning) when the ring is full.
//   nStartRegister / nVector4fCount: destination range in the virtual constant file
//   pConstantData: nVector4fCount float4s to store
// Fix vs. original: the DEBUG_FPC_PATCHER block below referenced `bPrePatch`,
// a local that only exists inside the commented-out prepatch path, so any
// DEBUG_FPC_PATCHER build failed to compile; the prepatch path being disabled
// means bPrePatch was always false, so the condition reduces to
// m_bTestAlwaysStateSync. Also removed the unused debug local `nEnd`.
void CFragmentProgramConstantPatcher_SPU::SetFragmentRegisterBlock( uint nStartRegister, uint nVector4fCount, const float * pConstantData )
{
#ifndef _CERT
// validate the register window against the virtual constant file size
if ( nStartRegister >= job_fpcpatch::MAX_VIRTUAL_CONST_COUNT || nStartRegister + nVector4fCount > job_fpcpatch::MAX_VIRTUAL_CONST_COUNT )
Error( "Invalid Fragment Register Block Range %u..%u\n", nStartRegister, nStartRegister + nVector4fCount );
#endif
#ifdef DEBUG_FPC_PATCHER
if( m_bSync )
{
// synchronous mode: SPU-side register state must match our shadow copy
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
m_state.GetSyncState( reg );
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
}
#endif
// we have 4 DMA elements ( 2..6 ) to fit the constant buffer; the 1st element may have to be as small as 16 bytes.
// this leaves the max constant buffer size 4 * 16kb + 16 bytes
const uint nMaxUploadRangeBeforeStateSync = ( 32 * 1024 ) / sizeof( fltx4 );
uint numUploadRangeQwords = m_state.m_nEndOfJournalIdx - g_spuGcmShared.m_nFpcpStartRangesAfterLastSync;
( void )nMaxUploadRangeBeforeStateSync; ( void )numUploadRangeQwords; // only consumed by the disabled prepatch path below
///////////////////////////////////////////////////////////////////////////
//
// PREPATCH MUST BE DONE IN (CTXFLUSH OR) DRAW JOB FROM NOW ON!!! g_spuGcmShared.m_nFpcpStartRangesAfterLastSync IS SYNCHRONOUS AND CORRECT THERE
//
//////////////////////////////////////////////////////////////////////////
/*
bool bPrePatch = nVector4fCount + 1 + numUploadRangeQwords > nMaxUploadRangeBeforeStateSync;
if( bPrePatch )
{
// force state sync now
if( g_spuGcmShared.m_enableStallWarnings )
{
Warning( "PPU-SPU Wait for RSX. SetFragmentRegisterBlock: Forced to set state on PPU, %u vectors, %u qwords in history. This is slow fallback path.\n", nVector4fCount, numUploadRangeQwords );
}
FpcPatch2( &g_nullFpHeader, sizeof( g_nullFpHeader ), NULL, NULL );
}
*/
// append to the journal; a non-zero return is the number of spins we stalled
// waiting for SPU/RSX to drain the ring
if( uint nAttempts = m_state.AddRange( nStartRegister, nVector4fCount, pConstantData ) )
{
if( g_spuGcmShared.m_enableStallWarnings )
{
Warning( "PPU-SPU Wait for RSX. SetFragmentRegisterBlock: Stall, %d spins. Waiting for more memory; %d qwords, %d jobs buffered up\n", nAttempts, m_state.m_nEndOfJournalIdx - m_state.m_pSharedState->m_nStartRanges, g_spuGcmShared.m_nFpcPatchCounter - m_state.m_pSharedState->m_nThisStatePatchCounter );
}
}
#ifdef DEBUG_FPC_PATCHER
// the prepatch path above is disabled, so the old "&& !bPrePatch" qualifier
// is always true; referencing bPrePatch here broke DEBUG_FPC_PATCHER builds
if( m_bTestAlwaysStateSync )
{
FpcPatch2( &g_nullFpHeader, sizeof( g_nullFpHeader ), NULL, NULL );
}
// mirror the write into the PPU shadow copy, then re-verify in sync mode
V_memcpy( m_pSyncState + nStartRegister, pConstantData, nVector4fCount * sizeof( fltx4 ) );
if( m_bSync )
{
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
m_state.GetSyncState( reg );
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
}
#endif
}
//volatile int g_nDebugStage = 0;
//
// Match the semantics of cgGLBindProgram()
// There are 2 formats of fragment shaders, see SDK docs "2. 2 Cg Compiler Options" and
// in Cg Compiler User's Guide:
// "7. 2 NV Binary Shader Format (VPO and FPO)"
// "7. 4 Cgb File Format Specification"
//
// Bind a pixel shader: lock a fresh ucode range in the fpcp ring, kick an SPU
// job (FpcPatch2) to patch the shader's constants into that range, and emit
// the RSX commands that select the patched program for subsequent draws.
void CFragmentProgramConstantPatcher_SPU::BindProgram( const struct IDirect3DPixelShader9 * psh )
{
MICRO_PROFILE( g_mpBindProgram );
const job_fpcpatch2::FpHeader_t * prog = psh->m_data.m_eaFp;
uint32 nFragmentProgramOffset = uintp( m_pPutFragmentProgram ) + m_nIoOffsetDelta;
// release the previous range to the SPU ring, then lock space for this ucode
g_spuGcmShared.m_fpcpRing.UnlockRsxMemoryForSpu();
m_pPutFragmentProgram = ( uint32* )g_spuGcmShared.m_fpcpRing.LockRsxMemoryForSpu( &g_spuGcmShared.m_fpcpJobChain, prog->m_nUcodeSize );
// recompute the RSX IO offset from the local-memory base
nFragmentProgramOffset = uintp( m_pPutFragmentProgram ) - uintp( g_ps3gcmGlobalState.m_pLocalBaseAddress );
if( !IsCert() && nFragmentProgramOffset >= g_ps3gcmGlobalState.m_nLocalSize )
{
Error( "Fragment program Ucode buffer offset 0x%X is at unexpected address not in local memory\n", nFragmentProgramOffset );
}
if ( !IsCert() && ( m_pPutFragmentProgram < m_pBuffer || m_pPutFragmentProgram >= m_pBufferEnd ) )
{
Error( "Fragment Program UCode buffer overflow.\n" );
}
#ifdef DEBUG_FPC_PATCHER
if( m_bSync )
{
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
m_state.GetSyncState( reg );
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
}
#endif
uint nTexControls = prog->m_nTexControls;
// set jump to self
GCM_CTX_RESERVE( 7 + 2 * nTexControls );
uint32 * pJts = NULL;
// kick the SPU job that patches constants into m_pPutFragmentProgram
FpcPatch2( prog, psh->m_data.m_nFpDmaSize, m_pPutFragmentProgram, pJts );
CELL_GCM_METHOD_SET_SHADER_CONTROL( GCM_CTX->current, prog->m_nShaderControl0 ); // +2
CELL_GCM_METHOD_SET_SHADER_PROGRAM( GCM_CTX->current, m_nBufferLocation + 1, ( nFragmentProgramOffset & 0x1fffffff ) ); // +2
CELL_GCM_METHOD_SET_VERTEX_ATTRIB_OUTPUT_MASK( GCM_CTX->current, psh->m_data.m_attributeInputMask | 0x20 ); // +2
// copy the shader's texture-control method pairs straight into the command buffer
V_memcpy( GCM_CTX->current, prog->GetTexControls(), nTexControls * sizeof( uint32 ) * 2 );
GCM_CTX->current += 2 * nTexControls;
#ifdef DEBUG_FPC_PATCHER
if( m_bSync )
{
// synchronous debugging: flush, spin until the SPU clears the JTS,
// then re-validate the patched ucode against the PPU shadow state
g_ps3gcmGlobalState.CmdBufferFlush( CPs3gcmGlobalState::kFlushForcefully );
while ( *( volatile uint32* )pJts )
{
sys_timer_usleep( 50 );// wait for nop
}
#ifdef DEBUG_FPC_PATCHER
{
fltx4 reg[job_fpcpatch::MAX_VIRTUAL_CONST_COUNT];
m_state.GetSyncState( reg );
Assert( !V_memcmp( m_pSyncState, reg, sizeof( fltx4 ) * job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
}
ValidatePatchedProgram( psh->m_pCgProg, m_pPutFragmentProgram );
uint32 nFragmentProgramOffsetCheck;
GCM_MUST_SUCCEED( cellGcmAddressToOffset, m_pPutFragmentProgram, &nFragmentProgramOffsetCheck );
Assert( nFragmentProgramOffsetCheck == nFragmentProgramOffset );
#endif
g_ps3gcmGlobalState.CmdBufferFinish();
}
#endif
m_nFpcPatchCounter++;
}
uint g_nFpcPatch2JobExtraFlags = 0; // set this to 2 and SPU will break
static int s_nFpcPatch2Calls = 0; // debug: total FpcPatch2 job submissions
// Build and submit one SPURS job (job_fpcpatch2) that applies the journaled
// constant ranges to a fragment program's ucode.
//   prog / nFpDmaSize: fragment-program header blob to DMA to the SPU
//   pPatchedProgram: destination ucode in RSX-visible memory; NULL = state-sync only
//   pJts: optional jump-to-self word the SPU clears when done; may be NULL
// The job is made synchronous (stalls successors, writes state back) when there
// is no patch target or the pending range is large.
void CFragmentProgramConstantPatcher_SPU::FpcPatch2( const job_fpcpatch2::FpHeader_t * prog, uint nFpDmaSize, void *pPatchedProgram, uint32 * pJts )
{
MICRO_PROFILE( g_mpFpcPatch2 );
#ifdef VJOBS_ON_SPURS
VjobChain3 &jobChain = g_spuGcm.m_jobSink;
uint32 nUCodeSize = prog->m_nUcodeSize;
CellSpursJob128 * pJob = g_spuGcm.m_jobPool128.Alloc( *g_spuGcm.m_pRoot->m_pFpcPatch2 );
Assert( pJob->header.sizeDmaList == 0 && pJob->header.sizeInOrInOut == 0 ); // the default MUST always be 1
pJob->header.useInOutBuffer = 1;
// DMA elements 0..1: program header and shared state
CDmaListConstructor dmaConstructor( pJob->workArea.dmaList );
dmaConstructor.AddInputDma( nFpDmaSize, prog );
dmaConstructor.AddInputDma( sizeof( *m_state.m_pSharedState ), ( void* )m_state.m_pSharedState );
// the g_spuGcmShared.m_nFpcpStartRangesAfterLastSync runs ahead of m_state.m_pSharedState->m_nStartRanges , because it's a PREDICTED
// start of range. It'll be absolutely in-sync with m_state.m_pSharedState->m_nStartRanges if we run SPUs synchronously
#ifdef DBGFLAG_ASSERT
uint nSharedStateStartRanges = m_state.m_pSharedState->m_nStartRanges;
#endif
// NOTE: if the asserts below fire, it may be due to invalid value in nSharedStateStartRanges because SPU DMAs stuff right down to m_state.m_pSharedState and it's changing while this code executes
Assert( uint( m_state.m_nEndOfJournalIdx - nSharedStateStartRanges ) <= m_state.m_pSharedState->m_nBufferMask + 1 );
Assert( uint( g_spuGcmShared.m_nFpcpStartRangesAfterLastSync - nSharedStateStartRanges ) <= uint( m_state.m_nEndOfJournalIdx - nSharedStateStartRanges ) );
uint nStartOfJournal = /*nSharedStateStartRanges*/g_spuGcmShared.m_nFpcpStartRangesAfterLastSync, nBufferMask = m_state.m_pSharedState->m_nBufferMask;
// we have 4 DMA elements ( 2..6 ) to fit the constant buffer; the 1st element may have to be as small as 16 bytes.
// this leaves the max constant buffer size 4 * 16kb + 16 bytes
const uint numRangeQwords = ( m_state.m_nEndOfJournalIdx - nStartOfJournal );
Assert( numRangeQwords <= nBufferMask + 1 );
if ( numRangeQwords != 0 )
{
// journal range may wrap the ring: one or two DMA spans
uint nEndOfSpan0 = ( nStartOfJournal + nBufferMask + 1 ) & ~nBufferMask;
if ( ( signed int )( nEndOfSpan0 - m_state.m_nEndOfJournalIdx ) >= 0 )
{
//numRangeQwords = ( m_state.m_nEndOfJournalIdx - nStartOfJournal );
dmaConstructor.AddInputDmaLarge( ( numRangeQwords ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() + ( nStartOfJournal & nBufferMask ) );
}
else
{
//numRangeQwords = nFirstRange + nSecondRange ;
dmaConstructor.AddInputDmaLarge( ( nEndOfSpan0 - nStartOfJournal ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() + ( nStartOfJournal & nBufferMask ) );
dmaConstructor.AddInputDmaLarge( ( m_state.m_nEndOfJournalIdx - nEndOfSpan0 ) * sizeof( fltx4 ), m_state.m_pSharedState->GetBufferStart() );
}
}
else
{
dmaConstructor.AddSizeInOrInOut( 16 ); // we need at least 16 bytes in the ranges area for temporary storage
}
dmaConstructor.FinishIoBuffer( &pJob->header );
if( pJob->header.sizeDmaList > 7 * sizeof( uint64 ) )
{
Error( "FpcPatch2: DMA list size out of range (%d). job_fpcpatch2 parameters won't fit. numRangeQwords = %d\n", pJob->header.sizeDmaList, numRangeQwords );
}
// IMPORTANT: make it always synchronous , in case we don't have the target to patch. The only reason for this job to exist is to make it synchronous
// Also, if the range is large, still make it synchronous, to avoid subsequent jobs doing a lot of computations in vein
uint nAsync = !pPatchedProgram || numRangeQwords >= 1024 ? 0 : ( ( m_nFpcPatchCounter ) & m_nFpcPatchSyncMask ) ;
// job parameters ride in dmaList slots 7..9 (counters, target, journal window)
dmaConstructor[7][0] = m_nFpcPatchCounterOfLastSyncJob;
dmaConstructor[7][1] = m_nFpcPatchCounter;
dmaConstructor[8][0] = ( uint32 ) pPatchedProgram;
dmaConstructor[8][1] = uintp( pJts ); // the SPU->RSX dma element; may be NULL
dmaConstructor[9][0] = m_state.m_nEndOfJournalIdx;
dmaConstructor[9][1] = ( uint32 ) nStartOfJournal;
if( !IsCert() )
{
pJob->header.jobType |= CELL_SPURS_JOB_TYPE_MEMORY_CHECK;
}
dmaConstructor[8][0] |= g_nFpcPatch2JobExtraFlags;
if ( !nAsync )
{
// synchronous job: SPU writes its state back and stalls successor jobs;
// our predicted journal start catches up to the end of the journal
dmaConstructor[8][0] |= job_fpcpatch::FLAG_PUT_STATE;
m_nFpcPatchCounterOfLastSyncJob = m_nFpcPatchCounter;
pJob->header.jobType |= CELL_SPURS_JOB_TYPE_STALL_SUCCESSOR;
g_spuGcmShared.m_nFpcpStartRangesAfterLastSync = m_state.m_nEndOfJournalIdx;
}
#ifdef DBGFLAG_ASSERT
int nError = cellSpursCheckJob( ( const CellSpursJob256* )pJob, sizeof( *pJob ), 256 );
static int s_nJobErrors = 0;
if( CELL_OK != nError )
{
++s_nJobErrors;
}
#endif
if ( !nAsync )
{
jobChain.PushSyncJobSync( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
}
else
{
jobChain.Push( CELL_SPURS_JOB_COMMAND_JOB( pJob ) );
}
#ifdef DEBUG_FPC_PATCHER
if( m_bSync )
{
// spin until the SPU clears the JTS and the job binary is released
if( pJts )
{
volatile uint32 * pJts2 = pJts;
while( *pJts2 )
continue;
}
volatile uint64_t * pEaJob = &pJob->header.eaBinary;
while( * pEaJob )
continue;
}
#endif
s_nFpcPatch2Calls++;
#endif
}
#ifdef DEBUG_FPC_PATCHER
extern void PatchUcodeConstSwap( uint32 * pDestination, const uint32 * pSource, int nLength );
extern uint fspatchGetLength( CGtype nType );
uint32 g_nConstLengthCounter[5] = { 0, 0, 0, 0, 0 }; // histogram of embedded-constant lengths (debug stat)
// Debug cross-check: re-patch the original Cg binary's embedded constants on
// the PPU from the shadow state (m_pSyncState) and assert the result matches
// the ucode the SPU job produced (pPatchedUcode), byte for byte.
void CFragmentProgramConstantPatcher_SPU::ValidatePatchedProgram( const CgBinaryProgram * prog, void * pPatchedUcode )
{
Assert( prog->profile == PROFILE_SCE_FP_RSX && prog->binaryFormatRevision == CG_BINARY_FORMAT_REVISION );
uint32 nUCodeSize = prog->ucodeSize;
// scratch copy of the unpatched ucode to patch on the PPU
void * pUcode = stackalloc( nUCodeSize );
void * pSourceUcode = ( ( uint8* ) prog ) + prog->ucode;
V_memcpy( pUcode, ( ( uint8* ) prog ) + prog->ucode, nUCodeSize );
CgBinaryParameter * pParameters = ( CgBinaryParameter * )( uintp( prog ) + prog->parameterArray ) ;
uint32 * pPatchDestination = NULL;
Assert( cellGcmCgGetCountParameter( ( CGprogram ) prog ) == prog->parameterCount );
for ( int nPar = 0; nPar < prog->parameterCount; ++nPar )
{
CgBinaryParameter * pPar = pParameters + nPar;
Assert( pPar == ( CgBinaryParameter * ) cellGcmCgGetIndexParameter( ( CGprogram ) prog, nPar ) );
#ifdef DBGFLAG_ASSERT
const char * pLeafName = ( const char * )( uintp( prog ) + pPar->name );
( void )pLeafName;
uint32 * pDefault = pPar->defaultValue ? ( uint32* )( uintp( prog ) + pPar->defaultValue ) : NULL ;
#endif
if ( pPar->embeddedConst )
{
Assert( pPar->res == CG_C && pPar->var == CG_UNIFORM ); // this MUST be a uniform constant.. at least I think that's the only kind we need to patch
const CgBinaryEmbeddedConstant * pEmbedded = ( const CgBinaryEmbeddedConstant* )( uintp( prog ) + pPar->embeddedConst );
int nLength = fspatchGetLength( pPar->type );
g_nConstLengthCounter[nLength] ++;
// a parameter may be embedded at multiple ucode locations
for ( uint nEm = 0; nEm < pEmbedded->ucodeCount; ++ nEm )
{
uint ucodeOffset = pEmbedded->ucodeOffset[nEm]; // is this the offset from prog structure start?
Assert( ucodeOffset < nUCodeSize - 4 );
#ifdef DBGFLAG_ASSERT
Assert( cellGcmCgGetEmbeddedConstantOffset( ( CGprogram ) prog, ( CGparameter ) pPar, nEm ) == ucodeOffset );
const float * pDefaultCheck = cellGcmCgGetParameterValues( ( CGprogram ) prog, ( CGparameter ) pPar );
Assert( pDefault == ( uint32* ) pDefaultCheck );
uint32 * pUcodeEmConst = ( uint32* )( uintp( pSourceUcode ) + ucodeOffset );
Assert( !pDefault || !V_memcmp( pDefault, pUcodeEmConst, nLength * 4 ) );
#endif
// patch from shadow state, then compare against the SPU-patched copy
pPatchDestination = ( uint32* )( uintp( pUcode ) + ucodeOffset );
uint32 * pPatchedCheck = ( uint32* )( uintp( pPatchedUcode ) + ucodeOffset );
PatchUcodeConstSwap( pPatchDestination, ( uint32* ) & ( m_pSyncState[pPar->resIndex] ), nLength );
Assert( !V_memcmp( pPatchDestination, pPatchedCheck, nLength * 4 ) );
}
}
}
Assert( !V_memcmp( pPatchedUcode, pUcode, nUCodeSize ) );
}
#endif
// Binds this PPU-side wrapper to the SPU-shared state block and resets the journal ring.
// nBufferQwords is the ring size in qwords (used as a power-of-two mask, so it must be a power of 2).
void FpcPatchState::Init( job_fpcpatch::FpcPatchState_t * pSharedState, uint32 nBufferQwords )
{
#ifdef _DEBUG
	//m_nRangesAdded = 0;
#endif
	// Compute the PPU-side mirror values first...
	m_nBufferMask = nBufferQwords - 1;
	// Non-cert builds start the journal near the wrap point — presumably to exercise
	// ring wraparound early; TODO confirm intent.
	m_nEndOfJournalIdx = IsCert() ? 0 : nBufferQwords - 128;
	m_pSharedState = pSharedState;

	// ...then publish the same values into the shared state the SPU job reads.
	pSharedState->m_nBufferMask = m_nBufferMask;
	pSharedState->m_nStartRanges = m_nEndOfJournalIdx;
	pSharedState->m_eaThis = pSharedState;
	pSharedState->m_nThisStatePatchCounter = 0;
	pSharedState->m_nDebuggerBreak = 0;
}
//--------------------------------------------------------------------------------------------------
// Reconstructs the full virtual constant register file on the PPU: copies the register snapshot
// from shared state, then replays every journal entry not yet consumed by the SPU — from the
// shared m_nStartRanges up to the PPU-side end of journal.
//--------------------------------------------------------------------------------------------------
void FpcPatchState::GetSyncState( fltx4 * pRegisters )
{
	V_memcpy( pRegisters, m_pSharedState->m_reg, job_fpcpatch:: MAX_VIRTUAL_CONST_COUNT * sizeof( fltx4 ) );
	for( uint nJournalIdx = m_pSharedState->m_nStartRanges; nJournalIdx < m_nEndOfJournalIdx ; )
	{
		// Each journal entry is one ConstRangeHeader_t qword followed by m_nCount constant qwords;
		// indices wrap through the ring via m_nBufferMask.
		job_fpcpatch:: ConstRangeHeader_t & range = ((job_fpcpatch::ConstRangeHeader_t*)m_pSharedState->GetBufferStart())[ nJournalIdx & m_pSharedState->m_nBufferMask ];
		nJournalIdx++; // step past the header to the constant payload
		for( uint nConstIdx = 0 ; nConstIdx < range.m_u32.m_nCount; ++nConstIdx, ++nJournalIdx )
		{
			pRegisters[ range.m_u32.m_nStart + nConstIdx ] = m_pSharedState->GetBufferStart()[nJournalIdx & m_pSharedState->m_nBufferMask ];
		}
	}
}
/*
void FpcPatchState::Reset()
{
m_nEndOfJournalIdx = 0;
m_pSharedState->m_nStartRanges = 0;
}
*/
#ifdef _DEBUG
// Set from the debugger: break inside AddRange when the watched constant index is written.
static int s_nDebugRangeAdd = -1, s_nDebugSetConst = -1;
#endif
//--------------------------------------------------------------------------------------------------
// Appends one constant range (a ConstRangeHeader_t qword followed by nCount register qwords) to
// the journal ring shared with the SPU. Spin-waits until the SPU consumer has freed enough space.
// Returns the number of wait iterations (callers use a non-zero return to report the stall).
//--------------------------------------------------------------------------------------------------
uint FpcPatchState::AddRange( uint32 nStart, uint32 nCount, const float * pData )
{
#ifndef _CERT
	if( nStart + nCount > job_fpcpatch::MAX_VIRTUAL_CONST_COUNT )
	{
		Error( "AddRange(%d..%d) out of range <%d\n", nStart, nCount, int( job_fpcpatch::MAX_VIRTUAL_CONST_COUNT ) );
	}
#endif
#ifdef _DEBUG
	//Assert( s_nDebugRangeAdd != m_nRangesAdded );
	// Debug hook: break when the watched constant (s_nDebugSetConst) falls inside this range.
	if( int( s_nDebugSetConst - nStart ) >= 0 && int( s_nDebugSetConst - nStart ) < int( nCount ) )
	{
		fltx4 flDebugRegister = LoadUnalignedSIMD( pData + 4 * int( s_nDebugSetConst - nStart ) );
		DebuggerBreak();
	}
	//++m_nRangesAdded;
#endif
	// spin-wait, then V_memcpy range
	COMPILE_TIME_ASSERT( sizeof( job_fpcpatch::ConstRangeHeader_t ) == 16 );
	const uint nSpins = 0x1FF;
	Assert( !( nSpins & ( nSpins + 1 ) ) ); // nSpins must be 2^k-1 (it was used as a spin mask; see commented-out check below)
	//
	// We need space for nCount + 1 QWords (1 Qword for the ConstRangeHeader_t)
	// And we need m_nEndOfJournalIdx != m_nStartRanges to distinguish between
	// the all-empty and all-full buffers
	//
	uint nAttempts = 0;
	for ( ; ; ++nAttempts )
	{
		uint32 nStartRanges = m_pSharedState->m_nStartRanges; // consumer progress, advanced by the SPU job
		Assert( uint32( m_nEndOfJournalIdx - nStartRanges ) <= m_nBufferMask + 1 );
		// compute the new end - start; is it running further than buffer size away?
		if ( ( m_nEndOfJournalIdx + nCount - ( nStartRanges + m_nBufferMask + 1 ) ) & 0x80000000 )
		{ // no, the comparison is negative, therefore it's safe to fill it in
			break;
		}
		// if ( ( nAttempts & nSpins ) == nSpins )
		{
			// the caller prints warning about this stall.
			sys_timer_usleep( 60 ); // TODO: proper spinwait; proper OS syncronization
			if( nAttempts == ( 1000000 / 60 ) )
			{
				// waiting for a second already ... dump everything we know about the patcher state
				Warning(
					"***************************************************************************************************************\n"
					"* SPU hang in FpcPatchState::AddRange(). Please send this log (including a couple of screens above) to Sergiy *\n"
					);
				Msg( "AddRange(%d,%d,%p), ", nStart, nCount, pData );
				Msg( "SharedState @%p {start=0x%X&0x%X,patch=%X,job=%X},", m_pSharedState, m_pSharedState->m_nStartRanges, m_pSharedState->m_nBufferMask, m_pSharedState->m_nThisStatePatchCounter, m_pSharedState->m_eaThisStateJobDescriptor );
				Msg( "FpcpState @%p {end=0x%X},", this, this->m_nEndOfJournalIdx );
				Msg( "SpuGcmShared trace {0x%X,0x%X,0x%X}\n", g_spuGcmShared.m_nFpcPatchCounterOfLastSyncJob, g_spuGcmShared.m_nFpcPatchCounter, g_spuGcmShared.m_nFpcpStartRangesAfterLastSync );
				Msg( "RSX put=%X, get=%X sysring{put=%X,end=%X}\n", g_spuGcmShared.m_eaGcmControlRegister->put, g_spuGcmShared.m_eaGcmControlRegister->get,
					g_spuGcmShared.m_sysring.m_nPut, g_spuGcmShared.m_sysring.m_nEnd );
				Msg( "last JTS ret guard patched @%X, ", *cellGcmGetLabelAddress( GCM_LABEL_DEBUG_FPCP_RING ) );
				// Dump both FPCP ring views (RSX side and SPU side) with the state of each segment's guard word.
				Msg( "ringRsx[%d]:", g_spuGcmShared.m_fpcpRing.m_ringRsx.Count() );
				for( int i = 0; i < g_spuGcmShared.m_fpcpRing.m_ringRsx.Count(); ++i )
				{
					RsxSpuDoubleRing::Segment_t & segment = g_spuGcmShared.m_fpcpRing.m_ringRsx[i];
					Msg(" {%X,%p,%s}", segment.m_eaBase, segment.m_pSpuJts, *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_LWSYNC ? "LWSYNC" : *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_JTS ? "JTS" : "ERROR" );
				}
				Msg( "\nringSpu[%d]:", g_spuGcmShared.m_fpcpRing.m_ringSpu.Count() );
				for( int i = 0; i < g_spuGcmShared.m_fpcpRing.m_ringSpu.Count(); ++i )
				{
					RsxSpuDoubleRing::Segment_t & segment = g_spuGcmShared.m_fpcpRing.m_ringSpu[i];
					Msg(" {%X,%p,%s}", segment.m_eaBase, segment.m_pSpuJts, *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_LWSYNC ? "LWSYNC" : *(segment.m_pSpuJts) == CELL_SPURS_JOB_COMMAND_JTS ? "JTS" : "ERROR" );
				}
				Msg( "***************************************************************************************************************\n" );
			}
		}
	}
	// we have enough free buffer to insert stuff
	job_fpcpatch::ConstRangeHeader_t *hdr = (job_fpcpatch::ConstRangeHeader_t *)AddInternalPtr();
	hdr->m_u32.m_nStart = nStart;
	hdr->m_u32.m_nCount = nCount;
	// add constants block
	AddInternalBlock( pData, nCount );
	return nAttempts;
}
#endif

View File

@@ -0,0 +1,123 @@
//========= Copyright © Valve Corporation, All rights reserved. ====//
//
// Fragment Program Constant Patcher: an SPU implementation, V0
//
#ifndef PS3_SHADER_CONSTANT_PATCH_SPU_HDR
#define PS3_SHADER_CONSTANT_PATCH_SPU_HDR
#ifdef _PS3
#include "vjobs/fpcpatch_shared.h"
#include <cg/cg.h>
#include <cg/cgBinary.h>
#ifdef _DEBUG
//#define DEBUG_FPC_PATCHER
#endif
// PPU-side wrapper around the constant journal ring shared with the fpcpatch SPU job.
// The PPU appends constant ranges (AddRange); the SPU consumes them and advances
// m_pSharedState->m_nStartRanges.
class FpcPatchState
{
public:
	job_fpcpatch::FpcPatchState_t * m_pSharedState; // state block shared with the SPU job
	uint32 m_nEndOfJournalIdx; // this is PPU-side variable only, written by PPU only
	fltx4 * GetBufferStart(){ return m_pSharedState->GetBufferStart() ; } // the buffer start address
	uint32 m_nBufferMask; // ring index mask: number of Qwords in the buffer, minus one (buffer size is a power of 2)
	//#ifdef _DEBUG
	//int m_nRangesAdded;
	//#endif
public:
	FpcPatchState(){m_pSharedState = NULL;}
	// Binds to the shared state block and resets the ring; nBufferQwords must be a power of two.
	void Init( job_fpcpatch::FpcPatchState_t * pSharedState, uint32 nBufferQwords );
	void Reset();
	// Appends a range header + nCount qwords of constants; returns the number of spin-wait attempts.
	uint AddRange( uint32 nStart, uint32 nCount, const float * pData );
	// Rebuilds the full register file from the shared snapshot plus the unconsumed journal.
	void GetSyncState( fltx4 * pRegisters );
protected:
	// Reserves one qword slot at the journal end and advances the end-of-journal index.
	fltx4 * AddInternalPtr()
	{
		fltx4 * pOut = GetBufferStart() + ( m_nEndOfJournalIdx & m_nBufferMask );
		m_nEndOfJournalIdx++;
		return pOut;
	}
	void AddInternal( const fltx4 f4 )
	{
		*AddInternalPtr() = f4;
	}
	// Copies numFltx4s qwords into the ring, splitting the copy at the wrap point.
	inline void AddInternalBlock( const void *pBlock, const uint32 numFltx4s )
	{
		// Fit the first portion until the end of the buffer, second portion at start
		uint32 const nCurrentIdx = ( m_nEndOfJournalIdx & m_nBufferMask ); // the start index to copy to
		uint32 const numFltx4sUntilEnd = ( -nCurrentIdx ) & m_nBufferMask; // number of fltx4's from the nCurrentIdx to the end of the current buffer ring
		uint32 const numFirstCopy = MIN( numFltx4sUntilEnd, numFltx4s ); // number of fltx4's to copy first
		memcpy( GetBufferStart() + nCurrentIdx, pBlock, numFirstCopy * sizeof( fltx4 ) );
		memcpy( GetBufferStart(), ( ( fltx4* ) pBlock ) + numFirstCopy, ( numFltx4s - numFirstCopy ) * sizeof( fltx4 ) );
		m_nEndOfJournalIdx += numFltx4s;
	}
};
struct IDirect3DPixelShader9 ;

// Patches fragment-program (pixel shader) constants into RSX ucode, either via the SPU
// fpcpatch job or synchronously on the PPU (debug). Public semantics mirror the cgGL* calls
// noted on the individual methods.
class CFragmentProgramConstantPatcher_SPU
{
public:
	CFragmentProgramConstantPatcher_SPU();
	~CFragmentProgramConstantPatcher_SPU();
	void InitLocal( void *pBuffer, uint nSize );
	void Shutdown();
	// semantics should match cgGLSetFragmentRegisterBlock()
	void SetFragmentRegisterBlock( uint StartRegister, uint Vector4fCount, const float* pConstantData );
	// semantics of cgGLBindProgram( pPixelShader->m_pixProgram->m_CGprogram )
	void BindProgram( const CgBinaryProgram *prog );
	void BindProgram( const struct IDirect3DPixelShader9 * prog );
	void BeginScene();
	void EndScene();
	//job_fpcpatch::FpcPatchState_t * GetSharedState(){return m_state.m_pSharedState; }
	// Journal bookkeeping: markers are journal indices; distances are measured in qwords.
	uint GetStateEndOfJournalIdx() { return m_state.m_nEndOfJournalIdx; }
	uint GetJournalCapacity() const { return m_state.m_nBufferMask + 1; }
	int GetJournalSpaceUsedSince( uint nMarker )const{ return int( m_state.m_nEndOfJournalIdx - nMarker ); }
	int GetJournalSpaceLeftSince( uint nMarker )const{ return int( ( m_state.m_nBufferMask + 1 ) - ( m_state.m_nEndOfJournalIdx - nMarker ) ); }
protected:
	void ResetPut();
	void * FpcPatch( const struct CgBinaryProgram * prog, void * pFragmentProgramDestination, uint32 * pJts );
	void FpcPatch2( const job_fpcpatch2::FpHeader_t * psh, uint nFpDmaSize, void *pPatchedProgram, uint32 * pJts );
protected:
	friend class CSpuGcm;
	FpcPatchState m_state; // constant journal shared with the SPU job
	uint32* m_pBuffer, *m_pBufferEnd; // patched-program output buffer range
	int m_nIoOffsetDelta; // m_pBuffer + m_nIoOffsetDelta == IO offset usable by RSX
	uint32 * m_pPutFragmentProgram;
	uint m_nFpcPatchCounterAtBeginScene; // used for timing
	uint m_nFpcPatchCounterOfLastSyncJob;
	uint m_nBufferLocation;// CELL_GCM_LOCATION_MAIN
	uint m_nFpcPatchCounter, m_nFpcPatchSyncMask;
	//uint m_nStartRangesAfterLastSync; // this is the index used to upload only the useful constants to SPU
	bool m_isBufferPassedIn; // presumably true when InitLocal received an external buffer (not owned) — TODO confirm
	bool m_bFpcPatchOnPpu, m_bEnableSPU;
#ifdef DEBUG_FPC_PATCHER
	void ValidatePatchedProgram( const CgBinaryProgram *prog, void * pPatchedUcode );
	fltx4 *m_pSyncState; // PPU-side shadow of all constants, used to validate SPU patching
	bool m_bTestAlwaysStateSync;
	bool m_bSync; // don't use JTS, but just patch synchronously (may be more stable with GPAD)
#endif
};
extern CFragmentProgramConstantPatcher_SPU g_pixelShaderPatcher; // Patches pixel shader constants
#endif
#endif

View File

@@ -0,0 +1,202 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Configure gcm to be inline, unsafe etc....
// Include (spu or ppu) after cell headers
//
//==================================================================================================
#ifndef INCLUDED_GCMCONFIG_H
#define INCLUDED_GCMCONFIG_H
// Comment in one GCMCONFIG defn as required
//#define GCMCONFIG(x) x##Inline
#define GCMCONFIG(x) x##UnsafeInline
//#define GCMCONFIG(x) x
#define GCM_FUNC( GCM_FUNCTION, ...) GCM_FUNCTION ( gpGcmContext, ##__VA_ARGS__ )
#define cellGcmSetReferenceCommand GCMCONFIG(cellGcmSetReferenceCommand)
#define cellGcmSetJumpCommand GCMCONFIG(cellGcmSetJumpCommand)
#define cellGcmSetCallCommand GCMCONFIG(cellGcmSetCallCommand)
#define cellGcmSetReturnCommand GCMCONFIG(cellGcmSetReturnCommand)
#define cellGcmSetAntiAliasingControl GCMCONFIG(cellGcmSetAntiAliasingControl)
#define cellGcmSetWaitLabel GCMCONFIG(cellGcmSetWaitLabel)
#define cellGcmSetWriteCommandLabel GCMCONFIG(cellGcmSetWriteCommandLabel)
#define cellGcmSetWriteBackEndLabel GCMCONFIG(cellGcmSetWriteBackEndLabel)
#define cellGcmSetWriteTextureLabel GCMCONFIG(cellGcmSetWriteTextureLabel)
#define cellGcmSetTimeStamp GCMCONFIG(cellGcmSetTimeStamp)
#define cellGcmSetInvalidateZcull GCMCONFIG(cellGcmSetInvalidateZcull)
#define cellGcmSetAlphaFunc GCMCONFIG(cellGcmSetAlphaFunc)
#define cellGcmSetBlendColor GCMCONFIG(cellGcmSetBlendColor)
#define cellGcmSetBlendEquation GCMCONFIG(cellGcmSetBlendEquation)
#define cellGcmSetBlendFunc GCMCONFIG(cellGcmSetBlendFunc)
#define cellGcmSetClearSurface GCMCONFIG(cellGcmSetClearSurface)
#define cellGcmSetClearColor GCMCONFIG(cellGcmSetClearColor)
#define cellGcmSetClearDepthStencil GCMCONFIG(cellGcmSetClearDepthStencil)
#define cellGcmSetColorMask GCMCONFIG(cellGcmSetColorMask)
#define cellGcmSetColorMaskMrt GCMCONFIG(cellGcmSetColorMaskMrt)
#define cellGcmSetCullFace GCMCONFIG(cellGcmSetCullFace)
#define cellGcmSetDepthBounds GCMCONFIG(cellGcmSetDepthBounds)
#define cellGcmSetDepthFunc GCMCONFIG(cellGcmSetDepthFunc)
#define cellGcmSetDepthMask GCMCONFIG(cellGcmSetDepthMask)
#define cellGcmSetFrontFace GCMCONFIG(cellGcmSetFrontFace)
#define cellGcmSetLineWidth GCMCONFIG(cellGcmSetLineWidth)
#define cellGcmSetLineSmoothEnable GCMCONFIG(cellGcmSetLineSmoothEnable)
#define cellGcmSetLineStippleEnable GCMCONFIG(cellGcmSetLineStippleEnable)
#define cellGcmSetLineStipplePattern GCMCONFIG(cellGcmSetLineStipplePattern)
#define cellGcmSetLogicOp GCMCONFIG(cellGcmSetLogicOp)
#define cellGcmSetPointSize GCMCONFIG(cellGcmSetPointSize)
#define cellGcmSetPolygonOffset GCMCONFIG(cellGcmSetPolygonOffset)
#define cellGcmSetPolySmoothEnable GCMCONFIG(cellGcmSetPolySmoothEnable)
#define cellGcmSetPolygonStippleEnable GCMCONFIG(cellGcmSetPolygonStippleEnable)
#define cellGcmSetPolygonStipplePattern GCMCONFIG(cellGcmSetPolygonStipplePattern)
#define cellGcmSetFrontPolygonMode GCMCONFIG(cellGcmSetFrontPolygonMode)
#define cellGcmSetBackPolygonMode GCMCONFIG(cellGcmSetBackPolygonMode)
#define cellGcmSetScissor GCMCONFIG(cellGcmSetScissor)
#define cellGcmSetShadeMode GCMCONFIG(cellGcmSetShadeMode)
#define cellGcmSetTwoSideLightEnable GCMCONFIG(cellGcmSetTwoSideLightEnable)
#define cellGcmSetStencilFunc GCMCONFIG(cellGcmSetStencilFunc)
#define cellGcmSetBackStencilFunc GCMCONFIG(cellGcmSetBackStencilFunc)
#define cellGcmSetStencilMask GCMCONFIG(cellGcmSetStencilMask)
#define cellGcmSetBackStencilMask GCMCONFIG(cellGcmSetBackStencilMask)
#define cellGcmSetStencilOp GCMCONFIG(cellGcmSetStencilOp)
#define cellGcmSetBackStencilOp GCMCONFIG(cellGcmSetBackStencilOp)
#define cellGcmSetZMinMaxControl GCMCONFIG(cellGcmSetZMinMaxControl)
#define cellGcmSetAlphaTestEnable GCMCONFIG(cellGcmSetAlphaTestEnable)
#define cellGcmSetBlendEnable GCMCONFIG(cellGcmSetBlendEnable)
#define cellGcmSetBlendEnableMrt GCMCONFIG(cellGcmSetBlendEnableMrt)
#define cellGcmSetLogicOpEnable GCMCONFIG(cellGcmSetLogicOpEnable)
#define cellGcmSetCullFaceEnable GCMCONFIG(cellGcmSetCullFaceEnable)
#define cellGcmSetDepthBoundsTestEnable GCMCONFIG(cellGcmSetDepthBoundsTestEnable)
#define cellGcmSetDepthTestEnable GCMCONFIG(cellGcmSetDepthTestEnable)
#define cellGcmSetDitherEnable GCMCONFIG(cellGcmSetDitherEnable)
#define cellGcmSetStencilTestEnable GCMCONFIG(cellGcmSetStencilTestEnable)
#define cellGcmSetTwoSidedStencilTestEnable GCMCONFIG(cellGcmSetTwoSidedStencilTestEnable)
#define cellGcmSetPolygonOffsetFillEnable GCMCONFIG(cellGcmSetPolygonOffsetFillEnable)
#define cellGcmSetRestartIndexEnable GCMCONFIG(cellGcmSetRestartIndexEnable)
#define cellGcmSetPointSpriteControl GCMCONFIG(cellGcmSetPointSpriteControl)
#define cellGcmSetInvalidateTextureCache GCMCONFIG(cellGcmSetInvalidateTextureCache)
#define cellGcmSetTextureBorderColor GCMCONFIG(cellGcmSetTextureBorderColor)
#define cellGcmSetTextureControl GCMCONFIG(cellGcmSetTextureControl)
#define cellGcmSetTextureOptimization GCMCONFIG(cellGcmSetTextureOptimization)
#define cellGcmSetCylindricalWrap GCMCONFIG(cellGcmSetCylindricalWrap)
#define cellGcmSetInvalidateVertexCache GCMCONFIG(cellGcmSetInvalidateVertexCache)
#define cellGcmSetRestartIndex GCMCONFIG(cellGcmSetRestartIndex)
#define cellGcmSetVertexData4f GCMCONFIG(cellGcmSetVertexData4f)
#define cellGcmSetFrequencyDividerOperation GCMCONFIG(cellGcmSetFrequencyDividerOperation)
#define cellGcmSetTransformBranchBits GCMCONFIG(cellGcmSetTransformBranchBits)
#define cellGcmSetVertexAttribInputMask GCMCONFIG(cellGcmSetVertexAttribInputMask)
#define cellGcmSetFragmentProgramGammaEnable GCMCONFIG(cellGcmSetFragmentProgramGammaEnable)
#define cellGcmSetRenderEnable GCMCONFIG(cellGcmSetRenderEnable)
#define cellGcmSetZpassPixelCountEnable GCMCONFIG(cellGcmSetZpassPixelCountEnable)
#define cellGcmSetClearReport GCMCONFIG(cellGcmSetClearReport)
#define cellGcmSetReport GCMCONFIG(cellGcmSetReport)
#define cellGcmSetZcullStatsEnable GCMCONFIG(cellGcmSetZcullStatsEnable)
#define cellGcmSetZcullControl GCMCONFIG(cellGcmSetZcullControl)
#define cellGcmSetZcullLimit GCMCONFIG(cellGcmSetZcullLimit)
#define cellGcmSetScullControl GCMCONFIG(cellGcmSetScullControl)
#define cellGcmSetVertexTextureAddress GCMCONFIG(cellGcmSetVertexTextureAddress)
#define cellGcmSetVertexTextureFilter GCMCONFIG(cellGcmSetVertexTextureFilter)
#define cellGcmSetVertexTextureControl GCMCONFIG(cellGcmSetVertexTextureControl)
#define cellGcmSetVertexTextureBorderColor GCMCONFIG(cellGcmSetVertexTextureBorderColor)
#define cellGcmSetPerfMonTrigger GCMCONFIG(cellGcmSetPerfMonTrigger)
#define cellGcmSetFogMode GCMCONFIG(cellGcmSetFogMode)
#define cellGcmSetFogParams GCMCONFIG(cellGcmSetFogParams)
#define cellGcmSetTransferLocation GCMCONFIG(cellGcmSetTransferLocation)
#define cellGcmSetDepthFormat GCMCONFIG(cellGcmSetDepthFormat)
#define cellGcmSetBlendOptimization GCMCONFIG(cellGcmSetBlendOptimization)
#define cellGcmSetPolygonOffsetLineEnable GCMCONFIG(cellGcmSetPolygonOffsetLineEnable)
#define cellGcmSetVertexAttribOutputMask GCMCONFIG(cellGcmSetVertexAttribOutputMask)
#define cellGcmSetTextureRemap GCMCONFIG(cellGcmSetTextureRemap)
#define cellGcmSetVertexProgramStartSlot GCMCONFIG(cellGcmSetVertexProgramStartSlot)
#define cellGcmSetVertexProgramRegisterCount GCMCONFIG(cellGcmSetVertexProgramRegisterCount)
#define cellGcmSetTransferDataMode GCMCONFIG(cellGcmSetTransferDataMode)
#define cellGcmSetDrawBegin GCMCONFIG(cellGcmSetDrawBegin)
#define cellGcmSetDrawEnd GCMCONFIG(cellGcmSetDrawEnd)
#define cellGcmSetVertexDataArrayFormat GCMCONFIG(cellGcmSetVertexDataArrayFormat)
#define cellGcmSetVertexDataArrayOffset GCMCONFIG(cellGcmSetVertexDataArrayOffset)
#define cellGcmSetUpdateFragmentProgramParameterLocation GCMCONFIG(cellGcmSetUpdateFragmentProgramParameterLocation)
#define cellGcmSetVertexDataBase GCMCONFIG(cellGcmSetVertexDataBase)
#define cellGcmSetFragmentProgramOffset GCMCONFIG(cellGcmSetFragmentProgramOffset)
#define cellGcmSetFragmentProgramControl GCMCONFIG(cellGcmSetFragmentProgramControl)
#define cellGcmSetClearZcullSurface GCMCONFIG(cellGcmSetClearZcullSurface)
#define cellGcmSetZcullEnable GCMCONFIG(cellGcmSetZcullEnable)
#define cellGcmSetUserCommand GCMCONFIG(cellGcmSetUserCommand)
#define cellGcmSetReportLocation GCMCONFIG(cellGcmSetReportLocation)
#define cellGcmSetNotifyIndex GCMCONFIG(cellGcmSetNotifyIndex)
#define cellGcmSetNotify GCMCONFIG(cellGcmSetNotify)
#define cellGcmSetTextureFilter GCMCONFIG(cellGcmSetTextureFilter)
#define cellGcmSetTextureAddress GCMCONFIG(cellGcmSetTextureAddress)
#define cellGcmSetUserClipPlaneControl GCMCONFIG(cellGcmSetUserClipPlaneControl)
#define cellGcmSetAnisoSpread GCMCONFIG(cellGcmSetAnisoSpread)
#define cellGcmSetNopCommand GCMCONFIG(cellGcmSetNopCommand)
#define cellGcmSetSkipNop GCMCONFIG(cellGcmSetSkipNop)
#define cellGcmReserveMethodSize GCMCONFIG(cellGcmReserveMethodSize)
#define cellGcmSetWriteBackEndLabelForConditional GCMCONFIG(cellGcmSetWriteBackEndLabelForConditional)
#define cellGcmSetWriteTextureLabelForConditional GCMCONFIG(cellGcmSetWriteTextureLabelForConditional)
#define cellGcmSetVertexProgram GCMCONFIG(cellGcmSetVertexProgram)
#define cellGcmSetFragmentProgramLoadLocation GCMCONFIG(cellGcmSetFragmentProgramLoadLocation)
#define cellGcmSetVertexProgramLoad GCMCONFIG(cellGcmSetVertexProgramLoad)
#define cellGcmSetVertexProgramLoadSlot GCMCONFIG(cellGcmSetVertexProgramLoadSlot)
#define cellGcmSetVertexProgramConstants GCMCONFIG(cellGcmSetVertexProgramConstants)
#define cellGcmSetVertexProgramParameterBlock GCMCONFIG(cellGcmSetVertexProgramParameterBlock)
#define cellGcmSetVertexDataArray GCMCONFIG(cellGcmSetVertexDataArray)
#define cellGcmSetTextureBorder GCMCONFIG(cellGcmSetTextureBorder)
#define cellGcmSetWaitFlip GCMCONFIG(cellGcmSetWaitFlip)
#define cellGcmSetFragmentProgramParameterPointer GCMCONFIG(cellGcmSetFragmentProgramParameterPointer)
#define cellGcmSetFragmentProgramParameter GCMCONFIG(cellGcmSetFragmentProgramParameter)
#define cellGcmSetFragmentProgram GCMCONFIG(cellGcmSetFragmentProgram)
#define cellGcmSetVertexProgramParameter GCMCONFIG(cellGcmSetVertexProgramParameter)
#define cellGcmSetFragmentProgramLoad GCMCONFIG(cellGcmSetFragmentProgramLoad)
#define cellGcmSetUpdateFragmentProgramParameter GCMCONFIG(cellGcmSetUpdateFragmentProgramParameter)
#define cellGcmSetTextureFilterSigned GCMCONFIG(cellGcmSetTextureFilterSigned)
#define cellGcmSetClipMinMax GCMCONFIG(cellGcmSetClipMinMax)
#define cellGcmSetViewport GCMCONFIG(cellGcmSetViewport)
#define cellGcmSetTextureAddressAnisoBiasRemap GCMCONFIG(cellGcmSetTextureAddressAnisoBiasRemap)
#define cellGcmSetTextureAddressAnisoBias GCMCONFIG(cellGcmSetTextureAddressAnisoBias)
#define cellGcmSetTexture GCMCONFIG(cellGcmSetTexture)
#define cellGcmSetVertexTexture GCMCONFIG(cellGcmSetVertexTexture)
#define cellGcmSetSurface GCMCONFIG(cellGcmSetSurface)
#define cellGcmSetSurfaceWindow GCMCONFIG(cellGcmSetSurfaceWindow)
#define cellGcmSetInlineTransfer GCMCONFIG(cellGcmSetInlineTransfer)
#define cellGcmInlineTransfer GCMCONFIG(cellGcmInlineTransfer)
#define cellGcmSetTransferImage GCMCONFIG(cellGcmSetTransferImage)
#define cellGcmTransferData GCMCONFIG(cellGcmTransferData)
#define cellGcmSetTransferData GCMCONFIG(cellGcmSetTransferData)
#define cellGcmSetConvertSwizzleFormat GCMCONFIG(cellGcmSetConvertSwizzleFormat)
#define cellGcmSetInlineTransferPointer GCMCONFIG(cellGcmSetInlineTransferPointer)
#define cellGcmSetTransferDataFormat GCMCONFIG(cellGcmSetTransferDataFormat)
#define cellGcmSetTransferDataOffset GCMCONFIG(cellGcmSetTransferDataOffset)
#define cellGcmSetTransferScaleMode GCMCONFIG(cellGcmSetTransferScaleMode)
#define cellGcmSetTransferScaleSurface GCMCONFIG(cellGcmSetTransferScaleSurface)
#define cellGcmSetTransferScaleSwizzle GCMCONFIG(cellGcmSetTransferScaleSwizzle)
#define cellGcmSetTransferReportData GCMCONFIG(cellGcmSetTransferReportData)
#define cellGcmSetDrawArrays GCMCONFIG(cellGcmSetDrawArrays)
#define cellGcmSetDrawIndexArray GCMCONFIG(cellGcmSetDrawIndexArray)
#define cellGcmSetDrawInlineArray GCMCONFIG(cellGcmSetDrawInlineArray)
#define cellGcmSetDrawInlineIndexArray32 GCMCONFIG(cellGcmSetDrawInlineIndexArray32)
#define cellGcmSetDrawInlineIndexArray16 GCMCONFIG(cellGcmSetDrawInlineIndexArray16)
#define cellGcmSetDrawInlineArrayPointer GCMCONFIG(cellGcmSetDrawInlineArrayPointer)
#define cellGcmSetVertexProgramConstantsPointer GCMCONFIG(cellGcmSetVertexProgramConstantsPointer)
#define cellGcmSetDrawInlineIndexArray32Pointer GCMCONFIG(cellGcmSetDrawInlineIndexArray32Pointer)
#define cellGcmSetDrawInlineIndexArray16Pointer GCMCONFIG(cellGcmSetDrawInlineIndexArray16Pointer)
#define cellGcmSetVertexProgramParameterBlockPointer GCMCONFIG(cellGcmSetVertexProgramParameterBlockPointer)
#define cellGcmSetWaitForIdle GCMCONFIG(cellGcmSetWaitForIdle)
#define cellGcmSetVertexData3f GCMCONFIG(cellGcmSetVertexData3f)
#define cellGcmSetVertexData2f GCMCONFIG(cellGcmSetVertexData2f)
#define cellGcmSetVertexData1f GCMCONFIG(cellGcmSetVertexData1f)
#define cellGcmSetVertexData4s GCMCONFIG(cellGcmSetVertexData4s)
#define cellGcmSetVertexDataScaled4s GCMCONFIG(cellGcmSetVertexDataScaled4s)
#define cellGcmSetVertexData2s GCMCONFIG(cellGcmSetVertexData2s)
#define cellGcmSetVertexData4ub GCMCONFIG(cellGcmSetVertexData4ub)
#define cellGcmSetTextureControlAlphaKill GCMCONFIG(cellGcmSetTextureControlAlphaKill)
#define cellGcmSetNoParanoidTextureFetches GCMCONFIG(cellGcmSetNoParanoidTextureFetches)
#define cellGcmSetInlineTransferAlignedPointer GCMCONFIG(cellGcmSetInlineTransferAlignedPointer)
#define cellGcmSetVertexProgramConstantsAlignedPointer GCMCONFIG(cellGcmSetVertexProgramConstantsAlignedPointer)
#define cellGcmSetVertexProgramParameterBlockAlignedPointer GCMCONFIG(cellGcmSetVertexProgramParameterBlockAlignedPointer)
#define cellGcmSetDrawInlineArrayAlignedPointer GCMCONFIG(cellGcmSetDrawInlineArrayAlignedPointer)
#define cellGcmSetDrawInlineIndexArray32AlignedPointer GCMCONFIG(cellGcmSetDrawInlineIndexArray32AlignedPointer)
#define cellGcmSetDrawInlineIndexArray16AlignedPointer GCMCONFIG(cellGcmSetDrawInlineIndexArray16AlignedPointer)
#endif // INCLUDED_GCMCONFIG_H

View File

@@ -0,0 +1,273 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
//
//
//==================================================================================================
//--------------------------------------------------------------------------------------------------
// Includes
//--------------------------------------------------------------------------------------------------
#include <libsn_spu.h>
#include "SpuMgr_spu.h"
#include "gcmdraw_spu.h"
#include "gcmdrawstate.h"
#include "gcmstate.h"
//--------------------------------------------------------------------------------------------------
// Globals
//--------------------------------------------------------------------------------------------------
// All of these buffers are DMA sources/targets and therefore need 16-byte alignment on both
// toolchains — hence the paired ALIGN16 (prefix form) / ALIGN16_POST (suffix form) annotations.
ALIGN16 VertexShader9Data_t gVertexShaderData ALIGN16_POST;
ALIGN16 PixelShader9Data_t gPixelShaderData ALIGN16_POST;
ALIGN16 CellGcmContextData gGcmContext ALIGN16_POST;	// fixed: ALIGN16_POST was duplicated
ALIGN16 CPs3gcmGlobalState g_ps3gcmGlobalState ALIGN16_POST ;
ALIGN16 CPs3gcmTextureLayout::Format_t g_ps3texFormats[PS3_TEX_MAX_FORMAT_COUNT] ALIGN16_POST;
ALIGN16 IDirect3DVertexDeclaration9 gDecl ALIGN16_POST;
// Fixed: a suffix alignment attribute must precede the initializer, not follow it.
ALIGN16 CellGcmContextData* gpGcmContext ALIGN16_POST = &gGcmContext;
ALIGN16 uint8 gFp[0x2000] ALIGN16_POST;	// pixel shader header/ucode/patch staging area
ALIGN16 uint8 gVp[0x2000] ALIGN16_POST;	// vertex shader command buffer staging area
ALIGN16 CPs3gcmTextureLayout gaLayout[D3D_MAX_TEXTURES] ALIGN16_POST;
ALIGN16 uint8 gaECB[3][0x1000] ALIGN16_POST;	// fixed: ALIGN16_POST was missing although ECBs are DMA targets
ALIGN16 CPs3gcmLocalMemoryBlock gLmBlock ALIGN16_POST;
int gEA;	// EA of the last texture layout requested (debug aid, see GetTextureLayouts)
//--------------------------------------------------------------------------------------------------
// Routine to DMA in texture Layouts
//--------------------------------------------------------------------------------------------------
void GetTextureLayouts()
{
// Loop and DMA in texture layouts
for (uint32 lp = 0; lp < ARRAYSIZE(gaLayout); lp++)
{
uintp ea = gpGcmDrawState->m_textures[lp].m_eaLayout;
gEA = ea;
if (ea) gSpuMgr.DmaGetSAFE( &gaLayout[lp], ea, sizeof(CPs3gcmTextureLayout), SPU_DMAGET_TAG );
}
gSpuMgr.DmaDone( SPU_DMAGET_TAG_WAIT );
}
//--------------------------------------------------------------------------------------------------
// main()
//--------------------------------------------------------------------------------------------------
//--------------------------------------------------------------------------------------------------
// Protocol
//
// Simplest possible for starters :
// PPU sends SPU Mbx the last part of the drawcall to perform.
// SPU performs it and DMAs down the data. When it's complete it send the PPUMbx the length of the drawcall
// PPU prepares next packet which waits on PPUMbx completion before sending another.
//
// Relies on PPU calling cellGcmReserveMethodSize with 16k, so that the SPU can go ahead and DMA back the
// draw..
//--------------------------------------------------------------------------------------------------
//--------------------------------------------------------------------------------------------------
// SPU entry point. Protocol (see block comment above): the PPU posts batches of drawstates via
// the SPU mailbox; for each batch we DMA in the drawstates plus their dependent data, build the
// GCM command stream locally, DMA the generated FIFO segment back out, and report the end address
// of the written segment through the outbound mailbox.
//--------------------------------------------------------------------------------------------------
int main(void)
{
	gSpuMgr.Init();

	// Initialise SPUs drawstate class
	uint32 eaGcmDrawState;
	gpGcmDrawState->Init();
	uint8* pData = gpGcmDrawState->m_pData;

	// Initialise context: a local FIFO buffer big enough for one whole batch of drawcalls
	gGcmContext.begin = (uint32*)MemAlloc_AllocAligned(GCM_DS_FIFOPERDRAW * GCM_NUMDRAWCALLS_SPU, 128);
	gGcmContext.end = gGcmContext.begin + (GCM_DS_FIFOPERDRAW * GCM_NUMDRAWCALLS_SPU)/4;
	gGcmContext.callback = NULL;

	// Pull in globalstate (the PPU sends its effective address as the first mailbox message)
	volatile uint32 eagGlobalState;
	gSpuMgr.ReadMailbox( (uint32_t *) &eagGlobalState );
	gSpuMgr.DmaGetUNSAFE( &g_ps3gcmGlobalState, eagGlobalState, SPUMGR_ALIGN_UP( sizeof(g_ps3gcmGlobalState), 16 ), SPU_DMAGET_TAG );
	gSpuMgr.DmaDone( SPU_DMAGET_TAG_WAIT );

	while(1)
	{
		// Mailbox word packs the batch: high 16 bits = drawstate count, low 16 bits = start index
		uint32 startidx, count, loop;
		gSpuMgr.ReadMailbox( (uint32_t *) &startidx );
		count = startidx >>16;
		startidx &= 0xFFFF;
		gpGcmContext->current = gpGcmContext->begin;

		// Loop over the drawstates
		for (loop = 0; loop < count; loop++)
		{
			uint32 idx = (startidx +loop) % GCM_DRAWSTATE_MAX;
			eaGcmDrawState = g_ps3gcmGlobalState.m_eaDrawStates + (idx*sizeof(CGcmDrawState));

			// Read drawstate
			gSpuMgr.DmaGetUNSAFE( gpGcmDrawState, eaGcmDrawState, SPUMGR_ALIGN_UP( DRAWSTATE_SIZEOFDMA, 16 ), SPU_DMAGET_TAG );
			gSpuMgr.DmaDone( SPU_DMAGET_TAG_WAIT );

			// Read Fixed Data, then repoint the drawstate at the local copy
			gSpuMgr.DmaGetUNSAFE( &gFixedData[0], uintp(gpGcmDrawState->m_pFixed), SPUMGR_ALIGN_UP(sizeof(gFixedData[0]), 16), SPU_DMAGET_TAG );
			gSpuMgr.DmaDone( SPU_DMAGET_TAG_WAIT );
			gpGcmDrawState->m_pFixed = &gFixedData[0];

			// Read Packed Data
			uint32* pParam = gpGcmDrawState->m_param;
			if (gpGcmDrawState->m_cmd & 0x80000000) snPause();	// top bit of m_cmd set: stop in the debugger
			gpGcmDrawState->m_cmd &= 0x7fffffff;
			uint32 packSize = gpGcmDrawState->m_pDataCursor - gpGcmDrawState->m_pData;
			gSpuMgr.DmaGetUNSAFE( pData, uintp(gpGcmDrawState->m_pData), SPUMGR_ALIGN_UP( packSize, 16 ), SPU_DMAGET_TAG );
			gpGcmDrawState->m_pData = pData;
			gpGcmDrawState->m_pDataCursor = pData + packSize;

			// DMA in any ECBs we will need...
			for ( uint32 lp = 0; lp < 3; lp++ )
			{
				if (gpGcmDrawState->m_aECB[lp])
				{
					gSpuMgr.DmaGetSAFE( gaECB[lp], uintp(gpGcmDrawState->m_aECB[lp]), gpGcmDrawState->m_aSizeECB[lp], SPU_DMAGET_TAG );
					gpGcmDrawState->m_aECB[lp] = gaECB[lp];
				}
			}
			gSpuMgr.DmaDone( SPU_DMAGET_TAG_WAIT );

			// Read Pixel Shader and Vertex Shader (skipped for pure state commits / end of frame)
			if ( (gpGcmDrawState->m_cmd != CmdCommitStates) && (gpGcmDrawState->m_cmd != CmdEndFrame ))
			{
				if(gpGcmDrawState->m_pVertexShaderData)
				{
					gSpuMgr.DmaGetUNSAFE( &gVertexShaderData, uintp(gpGcmDrawState->m_pVertexShaderData), SPUMGR_ALIGN_UP( sizeof(gVertexShaderData), 16 ), SPU_DMAGET_TAG );
					gpGcmDrawState->m_pVertexShaderData = &gVertexShaderData;
					gSpuMgr.DmaDone( SPU_DMAGET_TAG_WAIT );
					// FPHeader, UCode, patches etc...
					uintp ea = uintp(gVertexShaderData.m_pVertexShaderCmdBuffer);
					gSpuMgr.DmaGetUNSAFE( &gVp, ea, SPUMGR_ALIGN_UP((gVertexShaderData.m_nVertexShaderCmdBufferWords*4),16), SPU_DMAGET_TAG );
					gVertexShaderData.m_pVertexShaderCmdBuffer = (uint32*)gVp;
				}
				if(gpGcmDrawState->m_pPixelShaderData)
				{
					// PS Data
					gSpuMgr.DmaGetUNSAFE( &gPixelShaderData, uintp(gpGcmDrawState->m_pPixelShaderData), SPUMGR_ALIGN_UP( sizeof(gPixelShaderData), 16 ), SPU_DMAGET_TAG );
					gpGcmDrawState->m_pPixelShaderData = &gPixelShaderData;
					gSpuMgr.DmaDone( SPU_DMAGET_TAG_WAIT );
					// FPHeader, UCode, patches etc...
					uintp ea = uintp(gPixelShaderData.m_eaFp);
					gSpuMgr.DmaGetUNSAFE( &gFp, ea, SPUMGR_ALIGN_UP(gPixelShaderData.m_nTotalSize,16), SPU_DMAGET_TAG );
					gPixelShaderData.m_eaFp = (FpHeader_t*)gFp;
				}
				// Decl
				gSpuMgr.DmaGetUNSAFE( &gDecl, uintp(pParam[0]), SPUMGR_ALIGN_UP( sizeof(gDecl), 16 ), SPU_DMAGET_TAG );
				// Texture Formats
				gSpuMgr.DmaGetUNSAFE( &g_ps3texFormats, uintp(pParam[4]), SPUMGR_ALIGN_UP( sizeof(g_ps3texFormats), 16 ), SPU_DMAGET_TAG );
				gSpuMgr.DmaDone( SPU_DMAGET_TAG_WAIT );
			}

			// Process command
			switch(gpGcmDrawState->m_cmd)
			{
				case CmdCommitStates:
				case CmdEndFrame:
					gpGcmDrawState->CommitStates();
					break;

				case CmdDrawPrim:
					gpGcmDrawState->CommitAll(&gDecl, pParam[1]);
					// Draw
					GCM_FUNC( cellGcmSetDrawIndexArray,
						pParam[2], pParam[5],
						CELL_GCM_DRAW_INDEX_ARRAY_TYPE_16, CELL_GCM_LOCATION_LOCAL,
						pParam[3] );
					break;

				case CmdDrawPrimUP:
				{
					D3DStreamDesc &dsd = g_dxGcmVertexStreamSources[0];
					dsd.m_offset = 0;
					dsd.m_stride = pParam[2];
					dsd.m_vtxBuffer = ( IDirect3DVertexBuffer9 * )( uintp )1; // invalid pointer, but non-NULL to signal it's a real vertex buffer;
					dsd.m_nLocalBufferOffset = 0;
					gpGcmDrawState->CommitAll(&gDecl, 0);
					GCM_FUNC(cellGcmSetCallCommand, pParam[1]);
				}
				break;
			}
		} // End Loop over drawstates

		// DMA out packet
		// first fill context to a 16B boundary
		while (uintp(gpGcmContext->current) & 0xf)
		{
			*gpGcmContext->current = 0;
			gpGcmContext->current++;
		}

		// Send to fifo
		uint32 bytesUsed = (uint8*)gpGcmContext->current - (uint8*)gpGcmContext->begin;
		gSpuMgr.DmaSync();
		gSpuMgr.DmaPut(gpGcmDrawState->m_eaOutputFIFO, (void*)gpGcmContext->begin,
			bytesUsed, SPU_DMAPUT_TAG);
		gSpuMgr.DmaDone(SPU_DMAPUT_TAG_WAIT);

		// Send to SPU mailbox: EA of the end of the written FIFO segment (the PPU waits on this)
		gSpuMgr.WriteMailbox(gpGcmDrawState->m_eaOutputFIFO + bytesUsed);
	}
}

View File

@@ -0,0 +1,33 @@
#ifndef INCLUDED_GCMDRAW_SPU_H
#define INCLUDED_GCMDRAW_SPU_H
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Definitions shared between the PPU side and the SPU draw job: DMA tag assignments.
//
//==================================================================================================
//--------------------------------------------------------------------------------------------------
// Headers
//--------------------------------------------------------------------------------------------------
#ifdef SPU
#include "SpuMgr_spu.h"
#else
#include "tier0/platform.h"
#include "tier0/dbg.h"
#include "cell/gcm.h"	// fixed: was "cell\gcm.h" - a backslash in an include path is non-portable
#include "SpuMgr_ppu.h"
#endif
//--------------------------------------------------------------------------------------------------
// Defines for the DMA tags
//--------------------------------------------------------------------------------------------------
// Tag 0 is used for all DMA "get" transfers (main memory -> SPU local store),
// tag 1 for all "put" transfers (SPU -> main memory). The *_WAIT values are
// the corresponding tag-group masks passed to DmaDone() to wait for completion.
#define SPU_DMAGET_TAG 0
#define SPU_DMAGET_TAG_WAIT ( 1 << SPU_DMAGET_TAG )
#define SPU_DMAPUT_TAG 1
#define SPU_DMAPUT_TAG_WAIT ( 1 << SPU_DMAPUT_TAG )
#endif // INCLUDED_GCMDRAW_SPU_H

View File

@@ -0,0 +1,708 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Per draw call gcm state
//
//==================================================================================================
#define PPU_DRAW 0
#ifndef SPU
#define CELL_GCM_MEMCPY memcpy // PPU SNC has no such intrinsic
#endif
#ifndef SPU
#include "sys/memory.h"
#include "sysutil/sysutil_sysparam.h"
#include "cell/sysmodule.h"
#include "tier0/platform.h"
#include "tier0/dbg.h"
#include "tier1/utlbuffer.h"
#include "cell/gcm.h"
#include "gcmconfig.h"
#include "ps3gcmmemory.h"
#include "gcmstate.h"
#include "gcmlabels.h"
#include "gcmdrawstate.h"
#include "ps3/ps3_helpers.h"
#include <materialsystem/imaterialsystem.h>
#include <vprof.h>
#include "tier0/memdbgon.h"
#else
#include "spumgr_spu.h"
#include "gcmdrawstate.h"
#endif
//--------------------------------------------------------------------------------------------------
// Globals
//--------------------------------------------------------------------------------------------------
// Ring of draw-state packets: the PPU fills slots while the SPU drains batches of them.
ALIGN128 CGcmDrawState gGcmDrawState[GCM_DRAWSTATE_MAX] ALIGN128_POST;
// Slot currently being filled; SendToSpu() advances this around the ring.
CGcmDrawState* gpGcmDrawState = &gGcmDrawState[0];
// Zcull tuning values.
int g_bZcullAuto = 1;
int g_nZcullDefault = 100;
int g_nZcullMoveForward = 100;
int g_nZcullPushBack = 100;
// Per-D3D-stream cache of SetVertexDataArray parameters (avoids redundant GCM calls).
SetVertexDataArrayCache_t g_cacheSetVertexDataArray[ D3D_MAX_STREAMS ];
// Shadow copies of fragment-program and vertex-program constants.
vec_float4 g_aFPConst[GCM_DS_MAXFPCONST] = {0,};
vec_float4 g_aVPConst[GCM_DS_MAXVPCONST] = {0,};
// Current vertex stream bindings (D3D-style source descriptors).
D3DStreamDesc g_dxGcmVertexStreamSources[D3D_MAX_STREAMS];
// DrawPrimUP buffer usage counters - presumably high-water mark and per-frame usage; verify
uint32 g_UPHigh = 0;
uint32 g_UPFrame;
#ifndef SPU
ALIGN16 uint8 g_aDynECB[GCM_DS_MAXDYNECB] ALIGN16_POST; // Ring buffer of dynamic cmds
uint32 g_nDynECBIdx = 0;
#endif
// Fixed-size per-drawcall data: PPU keeps one slot per ring entry, SPU works on one at a time.
#ifndef SPU
ALIGN128 CGcmDrawState::FixedData gFixedData[GCM_DRAWSTATE_MAX] ALIGN128_POST;
#else
ALIGN128 CGcmDrawState::FixedData gFixedData[1] ALIGN128_POST;
#endif
// Variable-size packed per-drawcall data (constants, commands), same slot scheme as above.
#ifndef SPU
ALIGN128 uint8 gPackData[GCM_DRAWSTATE_MAX][GCM_DS_MAXDATAPERDRAWCALL] ALIGN128_POST;
#else
ALIGN128 uint8 gPackData[1][GCM_DS_MAXDATAPERDRAWCALL] ALIGN128_POST;
#endif
//--------------------------------------------------------------------------------------------------
// DX lookups etc..
//--------------------------------------------------------------------------------------------------
// These tables are auto-generated in dxabstract.cpp, UnpackD3DRSITable()
// They provide renderstate classes and their default values....
uint8 g_d3drs_defvalue_indices[D3DRS_VALUE_LIMIT] =
{ 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0300 | 1, 0300 | 2, 0100 | 3, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0300 | 1, 0300 | 0, 0100 | 4, 0000 | 0, 0000 | 0, 0300 | 1, 0300 | 1, 0000 | 0, 0300 | 2, 0300 | 1, 0300 | 0, 0300 | 5, 0100 | 0, 0300 | 1, 0300 | 0, 0100 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0300 | 0, 0300 | 0, 0300 | 0, 0300 | 6, 0300 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0300 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0300 | 0, 0300 | 4, 0300 | 4, 0300 | 4, 0300 | 7, 0300 | 0, 0300 | 8, 0300 | 8, 0100 | 8, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0000 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0200 | 4, 0000 | 0, 0000 | 0, 0100 | 0, 0300 | 0, 0100 | 4, 0100 | 4, 0100 | 0, 0000 | 0, 0100 | 0, 0100 | 3, 0100 | 0, 0100 | 0, 0000 | 0, 0000 | 0, 0100 | 0, 0300 | 0, 0000 | 0, 0100 | 6, 0100 | 6, 0100 | 0, 0100 | 0, 0100 | 6, 0100 | 0, 0100 | 0, 0300 | 4, 0300 | 8, 0100 | 0, 0000 | 0, 0100 | 0, 0100 | 9, 0100 | 0, 0300 | 4, 0000 | 0, 0100 | 0, 0300 | 4, 0100 | 2, 0100 | 4, 0300 | 0, 0300 | 0, 0100 | 0, 0000 | 0, 0100 | 6, 0100 | 6, 0100 | 0, 0100 | 0, 0100 | 6, 0100 | 0, 0100 | 0, 0300 | 0, 0300 | 4, 0300 | 4, 0300 | 4, 0300 | 7, 0300 | 10, 0300 | 10, 0300 | 10, 0100 | 8, 0300 | 0, 0300 | 0, 0000 | 0, 0000 | 0, 0100 | 0, 0100 
| 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 0, 0100 | 3, 0100 | 4, 0100 | 4};
// Palette of distinct render-state default values; entries are selected by the
// low bits of g_d3drs_defvalue_indices above (the leading octal digit appears
// to be a class/flag field - see UnpackD3DRSITable() in dxabstract.cpp).
uint32 g_d3drs_defvalues[11] =
{ 0x0, 0x31415926, 0x3, 0x2, 0x1, 0x7, 0x3F800000, 0x8, 0xFFFFFFFF, 0x42800000, 0xF };
// Maps D3DBLENDOP (1-based: ADD=1, SUBTRACT=2, REVSUBTRACT=3, MIN=4, MAX=5)
// to the CELL_GCM blend equation; slots 0 and 6 are out-of-range padding.
uint16 dxtogl_blendop[7] =
{
	/*invalid*/CELL_GCM_FUNC_ADD,
	CELL_GCM_FUNC_ADD,
	CELL_GCM_FUNC_SUBTRACT,
	CELL_GCM_FUNC_REVERSE_SUBTRACT,
	CELL_GCM_MIN,
	CELL_GCM_MAX,
	/*invalid*/CELL_GCM_FUNC_ADD,
};
// Maps D3DSTENCILOP (1-based) to the CELL_GCM stencil operation;
// slots 0 and 9 are out-of-range padding.
uint32 dxtogl_stencilmode[10] =
{
	/*invalid*/ CELL_GCM_KEEP,
	/*D3DSTENCILOP_KEEP*/ CELL_GCM_KEEP,
	/*D3DSTENCILOP_ZERO*/ CELL_GCM_ZERO,
	/*D3DSTENCILOP_REPLACE*/ CELL_GCM_REPLACE,
	/*D3DSTENCILOP_INCRSAT*/ CELL_GCM_INCR,
	/*D3DSTENCILOP_DECRSAT*/ CELL_GCM_DECR,
	/*D3DSTENCILOP_INVERT*/ CELL_GCM_INVERT,
	/*D3DSTENCILOP_INCR*/ CELL_GCM_INCR_WRAP,
	/*D3DSTENCILOP_DECR*/ CELL_GCM_DECR_WRAP,
	/*invalid*/ CELL_GCM_KEEP,
};
// addressing modes
// 1 D3DTADDRESS_WRAP Tile the texture at every integer junction.
// D3DTADDRESS_MIRROR Similar to D3DTADDRESS_WRAP, except that the texture is flipped at every integer junction.
// 3 D3DTADDRESS_CLAMP Texture coordinates outside the range [0.0, 1.0] are set to the texture color at 0.0 or 1.0, respectively.
// 4 D3DTADDRESS_BORDER Texture coordinates outside the range [0.0, 1.0] are set to the border color.
// D3DTADDRESS_MIRRORONCE Similar to D3DTADDRESS_MIRROR and D3DTADDRESS_CLAMP.
// Takes the absolute value of the texture coordinate (thus, mirroring around 0),
// and then clamps to the maximum value. The most common usage is for volume textures,
// where support for the full D3DTADDRESS_MIRRORONCE texture-addressing mode is not
// necessary, but the data is symmetric around the one axis.
// Maps D3DTEXTUREADDRESS (1-based, see comment block above) to the
// CELL_GCM texture wrap mode; slot 0 is out-of-range padding.
uint8 dxtogl_addressMode[6] =
{
	CELL_GCM_TEXTURE_WRAP, // no zero entry
	CELL_GCM_TEXTURE_WRAP, // from D3DTADDRESS_WRAP
	CELL_GCM_TEXTURE_MIRROR, // from D3DTADDRESS_MIRROR
	CELL_GCM_TEXTURE_CLAMP_TO_EDGE, // from D3DTADDRESS_CLAMP
	CELL_GCM_TEXTURE_BORDER, // from D3DTADDRESS_BORDER
	CELL_GCM_TEXTURE_MIRROR_ONCE_BORDER, // no D3DTADDRESS_MIRRORONCE support
};
// Maps a D3D max-anisotropy value (halved, i.e. index = maxAniso/2) to the
// nearest CELL_GCM aniso enum; everything >= 16 clamps to MAX_ANISO_16.
uint8 dxtogl_anisoIndexHalf[32] = // indexed by [ dxsamp->maxAniso / 2 ]
{
	CELL_GCM_TEXTURE_MAX_ANISO_1, // 0-1
	CELL_GCM_TEXTURE_MAX_ANISO_2, // 2-3
	CELL_GCM_TEXTURE_MAX_ANISO_4, // 4-5
	CELL_GCM_TEXTURE_MAX_ANISO_6, // 6-7
	CELL_GCM_TEXTURE_MAX_ANISO_8, // 8-9
	CELL_GCM_TEXTURE_MAX_ANISO_10, // 10-11
	CELL_GCM_TEXTURE_MAX_ANISO_12, // 12-13
	CELL_GCM_TEXTURE_MAX_ANISO_16, // 14-15
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 16
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 18
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 20
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 22
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 24
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 26
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 28
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 30
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 32
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 34
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 36
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 38
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 40
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 42
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 44
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 46
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 48
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 50
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 52
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 54
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 56
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 58
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 60
	CELL_GCM_TEXTURE_MAX_ANISO_16, // ... rest >= 62
};
// Combined min/mip filter selection: D3D expresses min and mip filters
// separately; GCM wants a single combined enum.
uint8 dxtogl_minFilter[4][4] = // indexed by _D3DTEXTUREFILTERTYPE on both axes: [row is min filter][col is mip filter].
{
	/* mip filter ---------------> D3DTEXF_NONE D3DTEXF_POINT D3DTEXF_LINEAR (D3DTEXF_ANISOTROPIC not applicable to mip filter) */
	/* min = D3DTEXF_NONE */ { CELL_GCM_TEXTURE_NEAREST, CELL_GCM_TEXTURE_NEAREST_NEAREST, CELL_GCM_TEXTURE_NEAREST_LINEAR, CELL_GCM_TEXTURE_NEAREST }, // D3DTEXF_NONE we just treat like POINT
	/* min = D3DTEXF_POINT */ { CELL_GCM_TEXTURE_NEAREST, CELL_GCM_TEXTURE_NEAREST_NEAREST, CELL_GCM_TEXTURE_NEAREST_LINEAR, CELL_GCM_TEXTURE_NEAREST },
	/* min = D3DTEXF_LINEAR */ { CELL_GCM_TEXTURE_LINEAR, CELL_GCM_TEXTURE_LINEAR_NEAREST, CELL_GCM_TEXTURE_LINEAR_LINEAR, CELL_GCM_TEXTURE_NEAREST },
	/* min = D3DTEXF_ANISOTROPIC */ { CELL_GCM_TEXTURE_LINEAR, CELL_GCM_TEXTURE_LINEAR_NEAREST, CELL_GCM_TEXTURE_LINEAR_LINEAR, CELL_GCM_TEXTURE_NEAREST }, // no diff from prior row, set maxAniso to effect the sampling
};
// Magnification filter mapping (aniso handled via maxAniso, not the filter enum).
uint8 dxtogl_magFilter[4] = // indexed by _D3DTEXTUREFILTERTYPE
{
	CELL_GCM_TEXTURE_NEAREST, // D3DTEXF_NONE not applicable to mag filter but we handle it like POINT (mat_showmiplevels hits this)
	CELL_GCM_TEXTURE_NEAREST, // D3DTEXF_POINT
	CELL_GCM_TEXTURE_LINEAR, // D3DTEXF_LINEAR
	CELL_GCM_TEXTURE_LINEAR, // D3DTEXF_ANISOTROPIC (aniso will be driven by setting maxAniso, not by a GL filter mode)
};
//--------------------------------------------------------------------------------------------------
// Send to SPU
//--------------------------------------------------------------------------------------------------
#ifndef SPU
// Nonzero while an SPU drawstate batch is in flight (its mailbox reply not yet read).
int gSpuJobIssued = 0;
// First ring index of the batch currently being accumulated, and the number of states in it.
uint32 gSpuStartIdx = 0;
uint32 gSpuCount = 0;
//--------------------------------------------------------------------------------------------------
// SPU DRAW CODE
//--------------------------------------------------------------------------------------------------
#if !PPU_DRAW
// Hands the current drawstate over to the SPU draw task.
// Drawstates accumulate in the gGcmDrawState ring and are kicked in batches:
// the SPU is only signalled every 4 states, or immediately on CmdEndFrame
// (which also waits synchronously for the SPU to finish).
// On return, gpGcmDrawState points at the next ring slot, pre-seeded with the
// persistent fields (shader data pointers, constants, backbuffer size) and
// with all per-drawcall state cleared.
void CGcmDrawState::SendToSpu()
{
	SpuTaskHandle *pTask = &g_ps3gcmGlobalState.m_spuHandle;
	// Get this drawcall indx and the next
	uint32 idx = gpGcmDrawState - gGcmDrawState;
	uint32 nextidx = (idx + 1) % GCM_DRAWSTATE_MAX;
	gSpuCount ++;
	// Move gpGcmDrawState to the next set of Data
	CGcmDrawState* pPrevDrawState = gpGcmDrawState;
	gpGcmDrawState = &gGcmDrawState[nextidx];
	// Carry persistent state forward into the new slot...
	gpGcmDrawState->m_shaderVxConstants = pPrevDrawState->m_shaderVxConstants;
	gpGcmDrawState->m_pPixelShaderData = pPrevDrawState->m_pPixelShaderData;
	gpGcmDrawState->m_pVertexShaderData = pPrevDrawState->m_pVertexShaderData;
	gpGcmDrawState->m_nBackBufferSize[0] = pPrevDrawState->m_nBackBufferSize[0];
	gpGcmDrawState->m_nBackBufferSize[1] = pPrevDrawState->m_nBackBufferSize[1];
	// ...and reset everything that is per-drawcall.
	gpGcmDrawState->m_pDataCursor = gpGcmDrawState->m_pData;
	gpGcmDrawState->m_dirtySamplersMask = 0;
	gpGcmDrawState->m_dirtyCachesMask = 0;
	gpGcmDrawState->m_dirtyStatesMask = 0;
	gpGcmDrawState->m_nFreeLabel = 0;
	memset(gpGcmDrawState->m_pFixed->m_aSamplerIdx, 0xff, sizeof(m_pFixed->m_aSamplerIdx));
	gpGcmDrawState->m_pFixed->m_nSampler = 0;
	gpGcmDrawState->m_pFixed->m_nInstanced = 0;
	gpGcmDrawState->m_nNumECB = 0;
	memset(gpGcmDrawState->m_aECB, 0, sizeof(m_aECB));
	// Keep batching until we have 4 states, unless this is the end of the frame.
	if ( (gSpuCount < 4) && (m_cmd != CmdEndFrame) ) return;
	// Send the state(s) to the SPU
	// Wait on previous drawcall
	if (gSpuJobIssued)
	{
		// The SPU mails back the EA just past the FIFO bytes it wrote.
		uint32 fifoPosn;
		gSpuMgr.ReadMailbox(pTask, &fifoPosn);
		gpGcmContext->current = (uint32*)fifoPosn;
	}
	// Makesure we have 16K at least, per drawcall (we issue 4 calls at a time)
	cellGcmReserveMethodSizeInline(gpGcmContext, (GCM_DS_FIFOPERDRAW*GCM_NUMDRAWCALLS_SPU)/4); // 16K per draw call, /4 because api takes wordcount
	// Makesure FIFO is on a 16B boundary
	while (uintp(gpGcmContext->current) & 0xf)
	{
		*gpGcmContext->current = 0;
		gpGcmContext->current++;
	}
	// Build count and startidx parameter to send to SPU
	uint32 mailboxparam = (gSpuCount<<16) | gSpuStartIdx;
	//Send this drawstate
	m_eaOutputFIFO = (uint32)gpGcmContext->current;
	__asm ( "eieio" );	// make the stores above visible before the mailbox write signals the SPU
	gSpuMgr.WriteMailbox(pTask, mailboxparam);
	gSpuJobIssued = 1;
	// If it's an endframe, wait for result now
	// comment out this if to always wait for the dma to come back
	if (m_cmd == CmdEndFrame)
	{
		uint32 fifoPosn;
		gSpuMgr.ReadMailbox(pTask, &fifoPosn);
		gpGcmContext->current = (uint32*)fifoPosn;
		gSpuJobIssued = 0;
	}
	gSpuStartIdx = nextidx;
	gSpuCount = 0;
}
#else // PPU_DRAW.....
//--------------------------------------------------------------------------------------------------
// Draw on PPU
//--------------------------------------------------------------------------------------------------
// PPU fallback (compiled only when PPU_DRAW is enabled): executes the
// drawstate command directly on the PPU instead of handing it to the SPU task.
// NOTE(review): gPackData1/gPackData2 and gFixedData1/gFixedData2 are not
// defined in the visible (SPU-draw) configuration, which defines the
// gPackData[]/gFixedData[] arrays instead - confirm this path still builds
// before enabling PPU_DRAW.
void CGcmDrawState::SendToSpu()
{
	// Makesure we have 16K at least
	cellGcmReserveMethodSizeInline(gpGcmContext, GCM_DS_FIFOPERDRAW/4); // 16K per draw call
	// Makesure FIFO is on a 16B boundary
	while (uintp(gpGcmContext->current) & 0xf)
	{
		*gpGcmContext->current = 0;
		gpGcmContext->current++;
	}
	// Process cmd on PPU
	switch (m_cmd)
	{
	case CmdCommitStates:
	case CmdEndFrame:
		// Housekeeping: free-label writeback, optional RSX reset, constant
		// zeroing, then unpack packed data and commit render states.
		if (m_nFreeLabel) UnpackSetWriteBackEndLabel(GCM_LABEL_MEMORY_FREE, m_nFreeLabel);
		if ( m_dirtyStatesMask & kDirtyResetRsx) UnpackResetRsxState();
		if (m_dirtyStatesMask & kDirtyZeroAllPSConsts) ZeroFPConsts();
		if (m_dirtyStatesMask & kDirtyZeroAllVSConsts) ZeroVPConsts();
		UnpackData(); // Pulls out pixel shader consts and sets vertex shader consts
		CommitRenderStates();
		break;
	case CmdDrawPrim:
	{
		gpGcmDrawState->CommitAll((IDirect3DVertexDeclaration9 *)m_param[0], m_param[1]);
		// Draw
		GCM_FUNC( cellGcmSetDrawIndexArray,
			m_param[2], m_param[5],
			CELL_GCM_DRAW_INDEX_ARRAY_TYPE_16, CELL_GCM_LOCATION_LOCAL,
			m_param[3] );
	}
	break;
	case CmdDrawPrimUP:
	{
		// Draw-up vertex data was pre-written to a command buffer segment
		// that is invoked via a CALL command (m_param[1] is its offset).
		D3DStreamDesc &dsd = g_dxGcmVertexStreamSources[0];
		dsd.m_offset = 0;
		dsd.m_stride = m_param[2];
		dsd.m_vtxBuffer = ( IDirect3DVertexBuffer9 * )( uintp )1; // invalid pointer, but non-NULL to signal it's a real vertex buffer;
		dsd.m_nLocalBufferOffset = 0;
		gpGcmDrawState->CommitAll((IDirect3DVertexDeclaration9 *)m_param[0], 0);
		GCM_FUNC(cellGcmSetCallCommand, m_param[1]);
	}
	break;
	}
	// Flip to the other set of Data
	if (gpGcmDrawState->m_pData == gPackData1)
	{
		gpGcmDrawState->m_pData = gPackData2;
		gpGcmDrawState->m_pFixed = &gFixedData2;
	}
	else
	{
		gpGcmDrawState->m_pData = gPackData1;
		gpGcmDrawState->m_pFixed = &gFixedData1;
	}
	gpGcmDrawState->m_pDataCursor = gpGcmDrawState->m_pData;
	// Clear per-drawcall state for the next use of this slot.
	m_dirtySamplersMask = 0;
	m_dirtyCachesMask = 0;
	m_dirtyStatesMask = 0;
	m_nFreeLabel = 0;
	memset(m_pFixed->m_aSamplerIdx, 0xff, sizeof(m_pFixed->m_aSamplerIdx));
	m_pFixed->m_nSampler = 0;
	m_pFixed->m_nInstanced = 0;
	m_nNumECB = 0;
	memset(m_aECB, 0, sizeof(m_aECB));
}
#endif // ndef SPU
#endif
//--------------------------------------------------------------------------------------------------
// test func to try to find corrupted ECBs
//--------------------------------------------------------------------------------------------------
// Debug helper: walks a compiled ECB (shader API command buffer) without
// executing it, to detect corruption. Breaks into the debugger on
// out-of-range opcodes and on textures whose local-memory offset has
// unexpected low bits set. CBCMD_JSR targets are validated recursively;
// the walk ends at the outermost CBCMD_END.
//
// Fixes vs. previous version:
//  - an unknown opcode now flags and stops the walk; previously release
//    builds had no default case at all, so pCmdBuf never advanced and the
//    loop spun forever (debug builds Assert'ed, then spun the same way)
//  - removed unused locals (pStart, pCmd, pLastCmd) and the large stretches
//    of commented-out interpreter code that were irrelevant to validation
// Every accepted opcode advances pCmdBuf by exactly the same amount as the
// real interpreter, so a clean walk proves the buffer is well-formed.
void CGcmDrawState::TestCommandBuffer( uint8 *pCmdBuf )
{
	uint8 *pReturnStack[20];
	uint8 **pSP = &pReturnStack[ARRAYSIZE(pReturnStack)];
	for(;;)
	{
		int nCmd = GetData<int>( pCmdBuf );
		// Any opcode beyond the last known command means the buffer is trashed
		if ( nCmd > CBCMD_SET_VERTEX_SHADER_NEARZFARZ_STATE ) DebuggerBreak();
		switch( nCmd )
		{
		case CBCMD_END:
			{
				if ( pSP == &pReturnStack[ARRAYSIZE(pReturnStack)] )
					return;	// outermost END: buffer walked cleanly
				else
				{
					// pop pc (only reachable if a return address was pushed)
					pCmdBuf = *( pSP ++ );
					break;
				}
			}
		case CBCMD_JUMP:
			pCmdBuf = GetData<uint8 *>( pCmdBuf + sizeof( int ) );
			break;
		case CBCMD_JSR:
			{
				Assert( pSP > &(pReturnStack[0] ) );
				// Validate the subroutine by recursing instead of pushing a
				// return address, then continue past the JSR operand.
				TestCommandBuffer( GetData<uint8 *>( pCmdBuf + sizeof( int ) ) );
				pCmdBuf = pCmdBuf + sizeof( int ) + sizeof( uint8 *);
				break;
			}
		case CBCMD_SET_PIXEL_SHADER_FLOAT_CONST:
			{
				// Skip opcode, start-const, count, then count vec4s of payload
				int nNumConsts = GetData<int>( pCmdBuf + 2 * sizeof( int ) );
				pCmdBuf += nNumConsts * 4 * sizeof( float ) + 3 * sizeof( int );
				break;
			}
		case CBCMD_SETPIXELSHADERFOGPARAMS:
			{
				// Unsupported on this backend; Error() presumably does not return
				Error("Pixel Shader Fog params not supported\n");
				break;
			}
		case CBCMD_STORE_EYE_POS_IN_PSCONST:
			{
				pCmdBuf += 2 * sizeof( int ) + sizeof( float );
				break;
			}
		case CBCMD_SET_DEPTH_FEATHERING_CONST:
			{
				pCmdBuf += 2 * sizeof( int ) + sizeof( float );
				break;
			}
		case CBCMD_SET_VERTEX_SHADER_FLOAT_CONST:
			{
				// Same layout as the pixel-shader const command above
				int nNumConsts = GetData<int>( pCmdBuf + 2 * sizeof( int ) );
				pCmdBuf += nNumConsts * 4 * sizeof( float ) + 3 * sizeof( int );
				break;
			}
		case CBCMD_BIND_PS3_TEXTURE:
			{
				CPs3BindTexture_t tex = GetData<CPs3BindTexture_t> (pCmdBuf + sizeof( int ));
				// NOTE(review): 0x7e mask preserved from the original - presumably
				// a 128B-alignment check that deliberately ignores bit 0; confirm
				if (tex.m_pLmBlock->Offset() & 0x7e) DebuggerBreak();
				pCmdBuf += sizeof(int) + sizeof(tex);
				break;
			}
		case CBCMD_BIND_PS3_STANDARD_TEXTURE:
			{
				CPs3BindTexture_t tex = GetData<CPs3BindTexture_t> (pCmdBuf + sizeof( int ));
				if (m_pFixed->m_nInstanced)
				{
					// Instanced rendering may substitute the standard texture,
					// but the bind flags and sampler come from the command stream.
					uint32 nBindFlags = tex.m_nBindFlags;
					uint32 nSampler = tex.m_sampler;
					switch (tex.m_boundStd)
					{
					case TEXTURE_LOCAL_ENV_CUBEMAP:
						if (m_pFixed->m_nInstanced & GCM_DS_INST_ENVMAP) tex = m_pFixed->m_instanceEnvCubemap;
						break;
					case TEXTURE_LIGHTMAP:
						if (m_pFixed->m_nInstanced & GCM_DS_INST_LIGHTMAP) tex = m_pFixed->m_instanceLightmap;
						break;
					case TEXTURE_PAINT:
						if (m_pFixed->m_nInstanced & GCM_DS_INST_PAINTMAP) tex = m_pFixed->m_instancePaintmap;
						break;
					}
					tex.m_nBindFlags = nBindFlags;
					tex.m_sampler = nSampler;
				}
				// Test texture
				if (tex.m_pLmBlock->Offset() & 0x7e) DebuggerBreak();
				pCmdBuf += sizeof(int) + sizeof(tex);
				break;
			}
		case CBCMD_PS3TEX:
			{
				pCmdBuf += sizeof(int) + (CBCMD_MAX_PS3TEX*sizeof(int));
				break;
			}
		case CBCMD_LENGTH:
			{
				pCmdBuf += sizeof(int) *2 ;
				break;
			}
		case CBCMD_SET_PSHINDEX:
			{
				Error("PSHINDEX Not Supported\n");
				break;
			}
		case CBCMD_SET_VSHINDEX:
			{
				pCmdBuf += 2 * sizeof( int );
				Error("VSHINDEX Not Supported\n");
				break;
			}
		case CBCMD_SET_VERTEX_SHADER_FLASHLIGHT_STATE:
			{
				// Payload skipped; flashlight state is handled elsewhere
				pCmdBuf += 2 * sizeof( int );
				break;
			}
		case CBCMD_SET_VERTEX_SHADER_NEARZFARZ_STATE:
			{
				Error("SetVertexShaderNearAndFarZ NOt SUPPORTED\n");
				pCmdBuf += 2 * sizeof( int );
				break;
			}
		case CBCMD_SET_PIXEL_SHADER_FLASHLIGHT_STATE:
			{
				// 12-int payload (samplers, constant registers, flags); skipped here
				pCmdBuf += 12 * sizeof( int );
				break;
			}
		case CBCMD_SET_PIXEL_SHADER_UBERLIGHT_STATE:
			{
				pCmdBuf += 7 * sizeof( int );
				Error("Uberlight state unsupported\n");
				break;
			}
		default:
			// Unknown opcode: previously this fell through without advancing
			// pCmdBuf and spun forever. Flag the corruption and stop the walk.
			Assert(0);
			DebuggerBreak();
			return;
		}
	}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,56 @@
//================ Copyright (c) 1996-2010 Valve Corporation. All Rights Reserved. =================
#ifndef PS3GCMFUNC_H
#define PS3GCMFUNC_H
// this is the buffer that all PPU GCM functions assume is the normal command buffer,
// but it is not in IO-mapped memory and it's the SPU that picks up and submits it to RSX.
// it's a level of indirection necessary to interleave SPU and PPU calls to GCM
#define GCM_CTX gCellGcmCurrentContext
#if GCM_CTX_UNSAFE_MODE
#error "This mode is not supported any more. Use SPU draw mode."
#endif
// Reserve callback used by GCM_CTX_RESERVE to guarantee nCount words of space in the context.
extern int32_t SpuGcmCommandBufferReserveCallback( struct CellGcmContextData *context, uint32_t nCount );
#define GCM_CTX_RESERVE( WORDS ) SpuGcmCommandBufferReserveCallback( GCM_CTX, WORDS )
// Call a cellGcm* function directly on the indirection context (out-of-line form).
#define GCM_FUNC_NOINLINE( GCM_FUNCTION, ...) GCM_FUNCTION( GCM_CTX, ##__VA_ARGS__ )
// Perf markers compile out entirely in _CERT builds.
#ifdef _CERT
#define GCM_PERF_RANGE( NAME )
#define GCM_PERF_PUSH_MARKER( NAME )
#define GCM_PERF_POP_MARKER( )
#define GCM_PERF_MARKER( NAME )
#else
// RAII helper: pushes a PerfMon marker on construction, pops it on scope exit.
class CGcmPerfAutoRange
{
public:
	CGcmPerfAutoRange( const char * pName ){ GCM_FUNC_NOINLINE( cellGcmSetPerfMonPushMarker, pName ); }
	~CGcmPerfAutoRange( ){ GCM_FUNC_NOINLINE( cellGcmSetPerfMonPopMarker ); }
};
#define GCM_PERF_RANGE( NAME ) CGcmPerfAutoRange _gcmAutoRange( NAME )
#define GCM_PERF_PUSH_MARKER( NAME ) GCM_FUNC_NOINLINE( cellGcmSetPerfMonPushMarker, NAME )
#define GCM_PERF_POP_MARKER( ) GCM_FUNC_NOINLINE( cellGcmSetPerfMonPopMarker )
#define GCM_PERF_MARKER( NAME ) GCM_FUNC_NOINLINE( cellGcmSetPerfMonMarker, ( NAME ) )
#endif
// Emit a GCM method safely: measure its exact word count, reserve that much
// space in the context, then emit with the unchecked inline variant.
#define GCM_FUNC( GCM_FUNCTION, ...) \
{ \
uint nReserveWords = GCM_FUNCTION ## MeasureSizeInline( 0, ##__VA_ARGS__ ); \
GCM_CTX_RESERVE( nReserveWords ); \
GCM_FUNCTION ## UnsafeInline( GCM_CTX, ##__VA_ARGS__ ); \
}
extern void SpuGcmCommandBufferFlush();
// NOTE(review): this macro expands to a function *declaration*, not a call,
// so a GCM_CTX_FLUSH_CHECKPOINT() statement is a no-op - presumably flush
// checkpoints are deliberately disabled; confirm before relying on it.
#define GCM_CTX_FLUSH_CHECKPOINT() void SpuGcmCommandBufferFlush()
// Poison direct cellGcmFlush usage: callers must go through the global-state flush.
#define cellGcmFlush must_use_____g_ps3gcmGlobalState_CmdBufferFlush
#endif

View File

@@ -0,0 +1,36 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Labels etc..
//
//==================================================================================================
#ifndef INCLUDED_GCMLABELS_H
#define INCLUDED_GCMLABELS_H
// RSX label indices (sync words in label memory). Valid range is 0..255;
// GCM reserves 0..63 for its own use.
enum GcmLabelEnum_t
{
	GCM_LABEL_QUERY_FIRST = 64, // GCM reserves the first 64 labels, do not use them
	GCM_LABEL_QUERY_LAST = GCM_LABEL_QUERY_FIRST + 99, // the last query label, inclusive (100 query labels total)
	GCM_LABEL_FPPATCH_RING_SEG = 252,
	GCM_LABEL_CALL_CMD_RING_SEG = 253, // Ring command buffer for DrawPrimUP and similar
	GCM_LABEL_FLIP_CONTROL = 254,
	GCM_LABEL_MEMORY_FREE = 255 // 255 is the very last possible index of a label
};
// RSX report slot indices.
enum GcmReportEnum_t
{
	// Used for occlusion queries
	GCM_REPORT_QUERY_FIRST = 0,
	GCM_REPORT_QUERY_LAST = GCM_REPORT_QUERY_FIRST + 512,
	// Used for RSX perf monitoring ... Four timestamps. Start and finish of this frame. Start and finish of previous frame
	GCM_REPORT_TIMESTAMP_FRAME_FIRST,
	GCM_REPORT_TIMESTAMP_FRAME_LAST = GCM_REPORT_TIMESTAMP_FRAME_FIRST + 3,
	// Used for Zcull stats
	GCM_REPORT_ZCULL_STATS_0,
	GCM_REPORT_ZCULL_STATS_1,
};
#endif // INCLUDED_GCMLABELS_H

View File

@@ -0,0 +1,848 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Gcm renderer state and util functions
//
//==================================================================================================
#ifndef SPU
#define CELL_GCM_MEMCPY memcpy // PPU SNC has no such intrinsic
#endif
#include "sys/memory.h"
#include "sysutil/sysutil_sysparam.h"
#include "cell/sysmodule.h"
#include "tier0/platform.h"
#include "tier0/dbg.h"
#include "tier1/utlbuffer.h"
#include "cell/gcm.h"
#include "gcmconfig.h"
#include "ps3gcmmemory.h"
#include "gcmstate.h"
#include "gcmlabels.h"
#include "gcmdrawstate.h"
#include "ps3/ps3_helpers.h"
#include <cell/gem.h> // PS3 move controller lib
#include "inputsystem/iinputsystem.h"
#include "memdbgon.h"
//--------------------------------------------------------------------------------------------------
// Globals, GCM context, flip control init prototype
//--------------------------------------------------------------------------------------------------
// Global GCM/RSX state singleton (video mode, IO memory, command buffers, SPU task handle).
ALIGN128 CPs3gcmGlobalState g_ps3gcmGlobalState ALIGN128_POST;
// The indirection GCM context the PPU writes into; the SPU picks it up and submits to RSX.
ALIGN16 CellGcmContextData gGcmContext ALIGN16_POST;
CellGcmContextData* gpGcmContext;
// Secondary context - presumably for CALL-style ring command buffers; verify against usage.
CellGcmContextData gCallContext;
CellGcmContextData* gpCallContext = &gCallContext;
static void Gcm_InitFlipControl(void);
static volatile uint32_t *s_label_call_cmd_ring_seg; // pointer to the call cmd label
volatile uint32_t *g_label_fppatch_ring_seg; // FP patch ring-segment label
//--------------------------------------------------------------------------------------------------
// Empty Ps
//--------------------------------------------------------------------------------------------------
// Precompiled CgBinaryProgram blob for a minimal "empty" pixel shader; Init()
// points m_pShaderPsEmpty at it and copies its ucode into local memory.
// (The bytes 0x43 0x4F 0x4C 0x4F 0x52 are the parameter name string "COLOR".)
// NOTE(review): binary dump - do not edit by hand.
uint8 g_dataShaderPsEmpty[] = {
0x00, 0x00, 0x1B, 0x5C, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xB0, 0x00, 0x00, 0x00, 0x01
, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x80
, 0x00, 0x00, 0x04, 0x18, 0x00, 0x00, 0x0A, 0xC5, 0x00, 0x00, 0x10, 0x05, 0xFF, 0xFF, 0xFF, 0xFF
, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50
, 0x00, 0x00, 0x10, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00
, 0x43, 0x4F, 0x4C, 0x4F, 0x52, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF
, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
, 0x1E, 0x7E, 0x7E, 0x00, 0xC8, 0x00, 0x1C, 0x9D, 0xC8, 0x00, 0x00, 0x01, 0xC8, 0x00, 0x00, 0x01
, 0x1E, 0x01, 0x01, 0x00, 0x28, 0x02, 0x1C, 0x9C, 0xC8, 0x00, 0x00, 0x01, 0xC8, 0x00, 0x00, 0x01
, 0x00, 0x00, 0x3F, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
//--------------------------------------------------------------------------------------------------
// Global GCM state class
//
// Global state, command buffers, RSX draw display buffers etc etc
//--------------------------------------------------------------------------------------------------

// One-time initialization of the global GCM state. Order matters:
// SPU task -> video output -> IO buffers -> libgcm init -> local-memory allocator ->
// display buffers -> empty pixel shader -> flip control -> hand this object to the SPU.
// Returns CELL_OK on success or the first cell error code encountered.
int32 CPs3gcmGlobalState::Init()
{
	MEM_ALLOC_CREDIT_( "GCM INIT" );

	Msg(">>>> Sizeof(CGcmDrawStateDma) %d \n", DRAWSTATE_SIZEOFDMA);
	Msg(">>>> Sizeof(CGcmDrawState) %d \n", sizeof(CGcmDrawState));

	// Create Raw SPU task for renderer acceleration
	gSpuMgr.Init(1);
	gSpuMgr.CreateSpuTask("rawspu_gcmdraw_spu.self", &m_spuHandle);

	// Default flip cap. NOTE(review): the original comment said "Default to 60Hz" but the
	// value stored is 30 (m_flipMode holds 30 or 60) — confirm which default is intended.
	m_flipMode = 30;

	// Video : display res, video buffer, gamma, RGB colour range
	if( int nError= InitVideo() )
		return nError;

	// Alloc IO memory, Set address, size of main memory pool for RSX
	CreateIoBuffers();

	// Init GCM : Map IO memory, Create command buffers
	if( int nError = InitGcm() )
		return nError;

	// Retrieve RSX local memory config
	CellGcmConfig rsxConfig;
	cellGcmGetConfiguration( &rsxConfig );
	m_pLocalBaseAddress = rsxConfig.localAddress;
	m_nLocalSize = rsxConfig.localSize;
	cellGcmAddressToOffset( m_pLocalBaseAddress, &m_nLocalBaseOffset );
	Assert( m_nLocalBaseOffset == 0 );

	// Init local memory mgr
	Ps3gcmLocalMemoryAllocator_Init();

	// Create display buffers etc..
	CreateRsxBuffers();

	// Create Empty PS: copy the precompiled ucode into local memory and derive the
	// attribute input mask plus the SET_SHADER_CONTROL word needed to run it.
	m_pShaderPsEmpty = reinterpret_cast< CgBinaryProgram * >( g_dataShaderPsEmpty );
	m_pShaderPsEmptyBuffer.Alloc( kAllocPs3GcmShader, m_pShaderPsEmpty->ucodeSize );
	V_memcpy( m_pShaderPsEmptyBuffer.DataInLocalMemory(), ( (char*)m_pShaderPsEmpty ) + m_pShaderPsEmpty->ucode, m_pShaderPsEmpty->ucodeSize );

	CgBinaryFragmentProgram *pCgFragmentProgram = ( CgBinaryFragmentProgram * )( uintp( m_pShaderPsEmpty ) + m_pShaderPsEmpty->program );
	m_nPsEmptyAttributeInputMask = pCgFragmentProgram->attributeInputMask;

	uint registerCount = pCgFragmentProgram->registerCount;
	// NOTE: actual register count can be modified by specifying an artificial e.g. PS3REGCOUNT48 static combo to force it to 48
	Assert( registerCount <= 48 );
	if (registerCount < 2)
	{
		// register count must be [2, 48]
		registerCount = 2;
	}

	// Build the shader-control word from the fragment program's properties
	uint8_t controlTxp = CELL_GCM_FALSE;
	uint32 shCtrl0 = ( CELL_GCM_COMMAND_CAST( controlTxp ) << CELL_GCM_SHIFT_SET_SHADER_CONTROL_CONTROL_TXP )
		& CELL_GCM_MASK_SET_SHADER_CONTROL_CONTROL_TXP;
	shCtrl0 |= ( 1<<10 ) | ( registerCount << 24 );
	shCtrl0 |= pCgFragmentProgram->depthReplace ? 0xE : 0x0;
	shCtrl0 |= pCgFragmentProgram->outputFromH0 ? 0x00 : 0x40;
	shCtrl0 |= pCgFragmentProgram->pixelKill ? 0x80 : 0x00;
	m_nPsEmptyShaderControl0 = shCtrl0;

	// Init flip control
	m_fastFlip = 0;
	Gcm_InitFlipControl();

	// Address of draw states
	m_eaDrawStates = uintp(gGcmDrawState);

	// Give SPU program this class
	gSpuMgr.WriteMailbox(&m_spuHandle, uintp(this));

	return CELL_OK;
}
// Allocates the main-memory block that will later be mapped into the RSX IO address space
// (the mapping happens in InitGcm) and carves it into sub-regions:
//   [ default cmd buffer | call cmd buffer | main memory pool | fp patch buffers ]
// Sizes come from the GCM_* literals in gcmstate.h and must sum to GCM_IOSIZE.
void CPs3gcmGlobalState::CreateIoBuffers()
{
	m_nIoSize = GCM_IOSIZE;

	// sys_memory_allocate with 1MB pages requires an MB-aligned size
	if ((m_nIoSize & 0xFFFFF) != 0) // MB aligned
	{
		Error("No MB alignment %x\n\n", m_nIoSize);
	}

	// Try to allocate main memory that will be mapped to IO address space
	// Actually mapped in in GcmInit, once gcm is going
	sys_addr_t pIoAddress = NULL;
	int nError = sys_memory_allocate( m_nIoSize, SYS_MEMORY_PAGE_SIZE_1M, &pIoAddress );
	if ( CELL_OK != nError )
	{
		Error( "sys_memory_allocate failed to allocate %d bytes (err: %d)\n", m_nIoSize, nError );
	}
	m_pIoAddress = (void *)pIoAddress;
	Msg( "======== GCM IO memory allocated @0x%p size = %d MB ========\n", m_pIoAddress, m_nIoSize / 1024 / 1024 );

	// Call command buffer sits immediately after the default command buffer
	m_pCallCmdBuffer = (void*)(uintp(pIoAddress) + GCM_DEFCMDBUFFSIZE);

	// RSX main memory pool buffer
	m_nRsxMainMemoryPoolBufferSize = GCM_MAINPOOLSIZE;
	m_pRsxMainMemoryPoolBuffer = (void*)(uintp(pIoAddress) + GCM_DEFCMDBUFFSIZE + GCM_CALLCMDBUFFSIZE);

	// Patch buffers follow the main pool
	m_pPatchBuff = (uint8*)m_pRsxMainMemoryPoolBuffer + GCM_MAINPOOLSIZE;
}
// Initializes libgcm with the default command buffer (mapping the IO memory allocated in
// CreateIoBuffers), takes a local copy of the gcm context, installs the out-of-space
// callback, and sets up the call-command ring and fragment-program patch ring with their
// RSX labels. Returns CELL_OK or the cellGcmInit error.
int CPs3gcmGlobalState::InitGcm()
{
	int32 result = cellGcmInit( GCM_DEFCMDBUFFSIZE, m_nIoSize, m_pIoAddress );
	if ( result < CELL_OK )
		return result;

	// Work on a local copy of the context and hook our command-buffer-full callback
	gGcmContext = *gCellGcmCurrentContext;
	gpGcmContext = &gGcmContext;
	gpGcmContext->callback = CmdBufferFull;

	// Set the flip mode etc...

	// Get the offset delta (add this to an effective address to get its IO offset)
	cellGcmAddressToOffset( m_pIoAddress, &m_nIoOffsetDelta );
	m_nIoOffsetDelta -= uintp( m_pIoAddress );

	// Setup call cmd buffer: ring state and the label the RSX uses to report its segment
	m_nCallCmdBufferoffset = uintp(m_pCallCmdBuffer) + m_nIoOffsetDelta;
	m_nCallWritePos = 0;
	m_nCallReadSegment = 0;
	s_label_call_cmd_ring_seg = cellGcmGetLabelAddress(GCM_LABEL_CALL_CMD_RING_SEG);
	*s_label_call_cmd_ring_seg = 0;

	// Setup Patch Buffers
	m_nPatchIdx = 0;
	m_nPatchReadSeg = 0;
	g_label_fppatch_ring_seg = cellGcmGetLabelAddress(GCM_LABEL_FPPATCH_RING_SEG);
	*g_label_fppatch_ring_seg = 0;

	return CELL_OK;
}
// Configures the PS3 video output: queries the current display mode, chooses the render
// resolution (with -480p / -1080p command-line overrides), programs the video output and
// render pitch, sets display gamma, and logs the RGB output range.
// Returns CELL_OK or the first cell error encountered.
int CPs3gcmGlobalState::InitVideo()
{
	//////////////////////////////////////////////////////////////////////////
	//
	// Initialize m_display
	//
	CellVideoOutState videoOutState;
	int result = cellVideoOutGetState( CELL_VIDEO_OUT_PRIMARY, 0, &videoOutState);
	if ( result < CELL_OK )
		return result;

	CellVideoOutResolution resolution;
	result = cellVideoOutGetResolution( videoOutState.displayMode.resolutionId, &resolution );
	if ( result < CELL_OK )
		return result;

	// Always output scanout in system m_display resolution
	m_nRenderSize[0] = resolution.width;
	m_nRenderSize[1] = resolution.height;

	// Overrides: force 480p when -480p is on the command line; on a 1080-class display
	// without -1080p, render at 720p (1080p output is upsampled from 720p)
	if ( resolution.height >= 720 && CommandLine()->FindParm( "-480p" ) )
	{
		m_nRenderSize[0] = 640;
		m_nRenderSize[1] = 480;
		videoOutState.displayMode.resolutionId = CELL_VIDEO_OUT_RESOLUTION_480;
	}
	else if ( resolution.height >= 1080 && !CommandLine()->FindParm( "-1080p" ) )
	{
		m_nRenderSize[0] = 1280;
		m_nRenderSize[1] = 720;
		videoOutState.displayMode.resolutionId = CELL_VIDEO_OUT_RESOLUTION_720;
	}

	//////////////////////////////////////////////////////////////////////////
	//
	// Set video output
	//
	CellVideoOutConfiguration videocfg;
	memset( &videocfg, 0, sizeof(videocfg) );
	videocfg.resolutionId = videoOutState.displayMode.resolutionId;
	videocfg.format = CELL_VIDEO_OUT_BUFFER_COLOR_FORMAT_X8R8G8B8;
	videocfg.pitch = cellGcmGetTiledPitchSize( m_nRenderSize[0] * 4 );	// tiled pitch for 32bpp
	m_nSurfaceRenderPitch = videocfg.pitch;

	// Configure video output
	result = cellVideoOutConfigure( CELL_VIDEO_OUT_PRIMARY, &videocfg, NULL, 0 );
	if ( result < CELL_OK )
		return result;

	// Get the new video output
	result = cellVideoOutGetState( CELL_VIDEO_OUT_PRIMARY, 0, &videoOutState );
	if ( result < CELL_OK )
		return result;
	m_flRenderAspect = ( videoOutState.displayMode.aspect == CELL_VIDEO_OUT_ASPECT_4_3 ) ? ( 4.0f/3.0f ) : ( 16.0f / 9.0f );

	// Set the gamma to deal with TV's having a darker gamma than computer monitors
	result = cellSysmoduleLoadModule( CELL_SYSMODULE_AVCONF_EXT );
	if ( result == CELL_OK )
	{
		cellVideoOutSetGamma( CELL_VIDEO_OUT_PRIMARY, 2.2f / 2.5f );
	}
	else
	{
		Warning( "***** ERROR calling cellSysmoduleLoadModule( CELL_SYSMODULE_AVCONF_EXT )! Gamma not set!\n" );
		return result;
	}

	// Output video color settings
	CellVideoOutDeviceInfo info;
	cellVideoOutGetDeviceInfo( CELL_VIDEO_OUT_PRIMARY, 0, &info );
	if ( info.rgbOutputRange == CELL_VIDEO_OUT_RGB_OUTPUT_RANGE_LIMITED )
	{
		DevMsg( "***** Video Out - Limited Range (16-235) - Gamma=%d *****\n", info.colorInfo.gamma );
	}
	else
	{
		DevMsg( "***** Video Out - Full Range (0-255) - Gamma=%d *****\n", info.colorInfo.gamma );
	}

	return CELL_OK;
}
// Allocates the automatic display surfaces in RSX local memory: the double-buffered
// scanout color buffers (registered with cellGcmSetDisplayBuffer) and the depth buffer
// (bound to zcull and a tiled region). Assumes InitVideo has set m_nRenderSize and
// m_nSurfaceRenderPitch.
void CPs3gcmGlobalState::CreateRsxBuffers()
{
	//////////////////////////////////////////////////////////////////////////
	//
	// Create automatic display objects
	//
	if( m_nSurfaceRenderPitch != cellGcmGetTiledPitchSize( m_nRenderSize[0] * 4 ) )
	{
		Error("Pre-computed surface render pitch %u != %u = cellGcmGetTiledPitchSize( %u * 4 ) ", m_nSurfaceRenderPitch, cellGcmGetTiledPitchSize( m_nRenderSize[0] * 4 ), m_nRenderSize[0] );
	}

	m_display.surfaceFlipIdx = 0;

	// Color buffers
	for ( int k = 0; k < ARRAYSIZE( m_display.surfaceColor ); ++ k )
	{
		uint32 nRenderSize32bpp = GetRenderSurfaceBytes(); // 32-line vertical alignment required in local memory
		m_display.surfaceColor[k].Alloc( kAllocPs3gcmColorBufferFB, nRenderSize32bpp );
		cellGcmSetDisplayBuffer( k, m_display.surfaceColor[k].Offset(), m_nSurfaceRenderPitch, m_nRenderSize[0], m_nRenderSize[1] );
	}

	// Depth buffer
	{
		// zcull dimensions must be 64-aligned
		uint32 zcullSize[2] = { AlignValue( m_nRenderSize[0], 64 ), AlignValue( m_nRenderSize[1], 64 ) };
		uint32 nDepthPitch = cellGcmGetTiledPitchSize( zcullSize[0] * 4 );
		uint32 uDepthBufferSize32bpp = nDepthPitch * zcullSize[1];
		uDepthBufferSize32bpp = AlignValue( uDepthBufferSize32bpp, PS3GCMALLOCATIONALIGN( kAllocPs3gcmDepthBuffer ) );
		m_display.surfaceDepth.Alloc( kAllocPs3gcmDepthBuffer, uDepthBufferSize32bpp );

		uint32 uiZcullIndex = m_display.surfaceDepth.ZcullMemoryIndex();
		cellGcmBindZcull( uiZcullIndex,
			m_display.surfaceDepth.Offset(),
			zcullSize[0], zcullSize[1],
			m_display.surfaceDepth.ZcullMemoryStart(),
			CELL_GCM_ZCULL_Z24S8,
			CELL_GCM_SURFACE_CENTER_1,
			CELL_GCM_ZCULL_LESS,
			CELL_GCM_ZCULL_LONES,
			CELL_GCM_SCULL_SFUNC_ALWAYS,
			0, 0 // sRef, sMask
			);

		uint32 uiTileIndex = m_display.surfaceDepth.TiledMemoryIndex();
		cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL, m_display.surfaceDepth.Offset(),
			uDepthBufferSize32bpp, m_nSurfaceRenderPitch, CELL_GCM_COMPMODE_Z32_SEPSTENCIL_REGULAR,
			m_display.surfaceDepth.TiledMemoryTagAreaBase(), // The area base + size/0x10000 will be allocated as the tag area.
			3 ); // Default depth buffer on bank 3
		cellGcmBindTile( uiTileIndex );
	}
}
// Orderly teardown: finish the current frame, wait for the RSX to drain the command
// buffer, unhook the flip/vblank callbacks, then unload the AV-config module loaded
// in InitVideo. Order matters — callbacks must stay valid until the GPU is idle.
void CPs3gcmGlobalState::Shutdown()
{
	gpGcmDrawState->EndFrame();
	gpGcmDrawState->CmdBufferFinish();

	cellGcmSetFlipHandler(NULL);
	cellGcmSetVBlankHandler(NULL);

	cellSysmoduleUnloadModule( CELL_SYSMODULE_AVCONF_EXT );
}
//--------------------------------------------------------------------------------------------------
// DrawPrimUp code...
//--------------------------------------------------------------------------------------------------
// Writes a complete inline-array draw (DrawPrimitiveUP) into the call command buffer and
// returns the effective address of the written commands, so the caller can issue a CALL
// to them from the main command buffer.
//
// The call buffer is a ring of GCM_CALLCMDSEGSIZE segments. Each packet begins by writing
// its segment index to GCM_LABEL_CALL_CMD_RING_SEG; the RSX thereby reports (via
// s_label_call_cmd_ring_seg) which segment it last entered, and the PPU spins below until
// the target segment is free, so unread commands are never overwritten.
//
// Fix: the segment-advance path previously declared a second `uint32 nextSeg`, shadowing
// the outer variable of the same name; the redundant declaration is removed.
uint32 CPs3gcmGlobalState::DrawPrimitiveUP(D3DPRIMITIVETYPE nPrimitiveType,UINT nPrimitiveCount,
										   CONST void *pVertexStreamZeroData, UINT nVertexStreamZeroStride )
{
	// First determine the size required for this call (measure APIs count words)
	uint32 size = 0;
	uint32 nIndexCount = GetGcmCount( nPrimitiveType, nPrimitiveCount );
	uint32 nDataWords = ( nVertexStreamZeroStride * nIndexCount + 3 ) / sizeof( uint32 );	// round vertex bytes up to whole words

	size = cellGcmSetWriteTextureLabelMeasureSize(size, GCM_LABEL_CALL_CMD_RING_SEG, 0 );
	size = cellGcmSetInvalidateVertexCacheMeasureSize(size);
	size = cellGcmSetDrawInlineArrayMeasureSize(size, GetGcmMode( nPrimitiveType ), nDataWords, pVertexStreamZeroData );
	size = cellGcmSetReturnCommandMeasureSize(size);
	size *=4;	// words -> bytes

	// Check whether the packet crosses out of the current segment
	uint32 endPos, nextSeg, readSeg, writeSeg;
	endPos = m_nCallWritePos + size;
	writeSeg = m_nCallWritePos/GCM_CALLCMDSEGSIZE;

	if ((endPos/GCM_CALLCMDSEGSIZE) != writeSeg)
	{
		// Move to the next segment (ring wrap)
		nextSeg = (writeSeg + 1) % (GCM_CALLCMDBUFFSIZE / GCM_CALLCMDSEGSIZE);

		// Wait for RSX to not be in this segment
		readSeg = m_nCallReadSegment;
		if(nextSeg == readSeg) readSeg = *s_label_call_cmd_ring_seg;

		gpGcmDrawState->CmdBufferFlush();

		uint32 spins = 0;
		while(nextSeg == readSeg)
		{
			spins++;
			sys_timer_usleep(60);
			readSeg = *s_label_call_cmd_ring_seg;
		}
		//if (spins > 1) Msg("Spins %d\n", spins);

		// Move to next segment and record new readSeg
		m_nCallWritePos = (nextSeg * GCM_CALLCMDSEGSIZE);
		writeSeg = nextSeg;
		m_nCallReadSegment = readSeg;

		// Msg("new Segment 0x%x\n", m_nCallWritePos);
	}

	uint32 ret = m_nCallWritePos + uintp(m_pCallCmdBuffer);

	// Write the commands through a temporary context spanning the call buffer
	CellGcmContextData context;
	context.begin = (uint32*)m_pCallCmdBuffer;
	context.current = (uint32*)((uint8*)m_pCallCmdBuffer + m_nCallWritePos);
	context.end = (uint32*)((uint8*)m_pCallCmdBuffer + GCM_CALLCMDBUFFSIZE);
	context.callback = 0;

	cellGcmSetWriteTextureLabelUnsafeInline(&context, GCM_LABEL_CALL_CMD_RING_SEG, writeSeg );
	cellGcmSetInvalidateVertexCacheUnsafeInline(&context);
	cellGcmSetDrawInlineArrayUnsafeInline(&context, GetGcmMode( nPrimitiveType ), nDataWords, pVertexStreamZeroData );
	cellGcmSetReturnCommandUnsafeInline(&context);

	// Advance the write cursor past this packet
	m_nCallWritePos += size;

	return ret;
}
//--------------------------------------------------------------------------------------------------
// Command Buffer callback
//--------------------------------------------------------------------------------------------------

// The default command buffer is treated as a ring of SEGSIZE-byte segments.
#define SEGSIZE 0x40000
#define SEGMASK 0x3FFFF

// libgcm out-of-space callback: called when the context write cursor reaches `end`.
// Advances to the next SEGSIZE-aligned region of the default command buffer (wrapping to
// its start), flushes what we have, emits a JUMP into the new region, then spins until the
// RSX GET pointer has left that region before handing it to the context.
int32 CPs3gcmGlobalState::CmdBufferFull(struct CellGcmContextData * pGcmContext, uint32_t size)
{
	// move to next SEGSIZE, and then wrap to start

	// Determine where the next buffer will be
	uint32 nIoAddress = (uint32)g_ps3gcmGlobalState.m_pIoAddress;

	uint32 nextBufferStart = ((uint32)pGcmContext->begin + SEGSIZE) & (~SEGMASK);
	nextBufferStart -= nIoAddress;
	nextBufferStart &= (GCM_DEFCMDBUFFSIZE-1);
	// offset 0 holds the ring start; skip to the second segment in that case
	nextBufferStart = nextBufferStart ? (nextBufferStart + nIoAddress) : (SEGSIZE + nIoAddress);

	// Flush RSX to this point
	cellGcmFlushUnsafeInline(pGcmContext);

	// put jump command to beginning of next buffer
	uint32 nextBufferOffset = nextBufferStart - nIoAddress;
	uint32 nextBufferEndOffset = ((nextBufferOffset + SEGSIZE) & (~SEGMASK)) - 4;	// leave room for the next JUMP
	cellGcmSetJumpCommandUnsafeInline(pGcmContext, nextBufferStart - nIoAddress );

	// get put/get/ref register address
	volatile CellGcmControl* control = cellGcmGetControlRegister();

	int count = 500000;	// (debug) spin counter — only used by the commented-out logging below

	// wait for RSX to finish all commands in next buffer (it's a ring buffer)
	volatile uint32_t get = (volatile uint32_t)control->get;
	while( (get < 0x1000 ) || ( (get >= nextBufferOffset) && (get < nextBufferEndOffset) ) )
	{
		sys_timer_usleep( 30 );
		get = (volatile uint32_t)control->get;

		// count--;
		// if (count < 1)
		// {
		// Msg("\n*****>>>> CmdBufferFull : get 0x%x : nextBufferOffset 0x%x : nextBufferEndOffset 0x%x\n", get, nextBufferOffset, nextBufferEndOffset );
		// count = 1;
		// }
	}

	// Set Command buffer context struct to the reclaimed region
	pGcmContext->begin = (uint32*)nextBufferStart;
	pGcmContext->end = (uint32*)(nextBufferEndOffset + nIoAddress);
	pGcmContext->current = (uint32*)nextBufferStart;

	return CELL_OK;
}
//--------------------------------------------------------------------------------------------------
// Flip Control
//
// Summary :
//
// Label used to cap the framerate. ie label to ensure flips no faster than 1 (60hz) or 2 (30Hz) vblanks.
// PPU blocks if previous flip not complete, so can't run too far ahead
// vblanks and flips noted by callbacks
//--------------------------------------------------------------------------------------------------

// States for the flip-control label and the PPU-side flip state machine.
enum {
	LABEL_FLIP_CONTROL_READY=1,	// when label-before-flip is released
	LABEL_FLIP_CONTROL_WAIT,	// when label-before-flip is not released
	/*
	label_flip_control:
	LABEL_FLIP_CONTROL_WAIT
	=> (when releasing flip by ppu) => LABEL_FLIP_CONTROL_READY,
	=> (when flip is finished by rsx) => LABEL_FLIP_CONTROL_WAIT,
	*/
	FLIP_STATE_V1=1,			// first vblank seen since last flip (30Hz pacing only)
	FLIP_STATE_FLIP_RELEASED,	// PPU has released the flip label; waiting for RSX to flip
	FLIP_STATE_FLIPPED,			// RSX completed the flip
	/*
	flip_status sequence (30fps or slower):
	FLIP_STATE_FLIPPED
	(at vblank callback) => FLIP_STATE_V1
	(at vblank callback) =<release flip>=> FLIP_STATE_FLIP_RELEASED
	(at flip callback) => FLIP_STATE_FLIPPED
	*/
	/*
	flip_status sequence (60fps or slower):
	FLIP_STATE_FLIPPED
	(at vblank callback) =<release flip>=> FLIP_STATE_FLIP_RELEASED
	(at flip callback) => FLIP_STATE_FLIPPED
	*/
};

static volatile uint32_t *s_label_flip_control;		// pointer to the flip control label
static int s_flip_status=FLIP_STATE_FLIPPED;		// status variable to control flip
//--------------------------------------------------------------------------------------------------

// Releases the pending flip by setting the flip-control label to READY.
// Returns false (without writing) if the label is already READY — i.e. the RSX has not
// yet consumed the previous release.
static bool Gcm_ReleaseFlip(void)
{
	if (*s_label_flip_control==LABEL_FLIP_CONTROL_READY) {
		/* just in case rsx is running very slow somehow */
		/* and flip_control label is not updated even after the real flip */
		return false;
	}

	*s_label_flip_control=LABEL_FLIP_CONTROL_READY;
	return true;
}
void updateCursorPosition(const int pixelX, const int pixelY)
{
cellGcmSetCursorPosition(pixelX, pixelY);
int32_t result = cellGcmUpdateCursor();
if( result == CELL_GCM_ERROR_FAILURE)
{
// [dkorus] this case happens until we initialize the cursor
//Msg(" hardware cursor error: cellGcmInitCursor() has not been called\n");
}
else if( result == CELL_GCM_ERROR_INVALID_VALUE )
{
Msg(" hardware cursor error: cursor bitmap is not correctly set\n");
}
}
// Turns on the hardware cursor and forces an immediate update; each step logs on failure.
void enableCursor()
{
	const bool bEnabled = ( cellGcmSetCursorEnable() == CELL_OK );
	if ( !bEnabled )
	{
		Msg( "Hardware Cursor Error: trouble with enable\n" );
	}

	const bool bUpdated = ( cellGcmUpdateCursor() == CELL_OK );
	if ( !bUpdated )
	{
		Msg( "Hardware Cursor Error: trouble with update\n" );
	}
}
// Vblank interrupt callback. Two jobs:
//  1) refresh the hardware cursor position from the input system, and
//  2) drive the flip-release state machine: at 30Hz pacing, wait one extra vblank
//     (FLIPPED -> V1 -> release); at 60Hz, release on the first vblank after a flip.
static void Gcm_VblankCallbackFunction(const uint32_t head)
{
	// unused arg
	(void)head;

	int pixelX, pixelY;
	if ( g_pInputSystem )
	{
		bool cursorEnabled = g_pInputSystem->GetPS3CursorPos( pixelX, pixelY );
		if( cursorEnabled )
		{
			updateCursorPosition(pixelX,pixelY);
		}
	}

	switch (s_flip_status){
	case FLIP_STATE_FLIPPED:
		if (g_ps3gcmGlobalState.m_flipMode == 30){
			// 30Hz: skip this vblank; release on the next one
			s_flip_status=FLIP_STATE_V1;
		} else if (g_ps3gcmGlobalState.m_flipMode == 60){
			if (Gcm_ReleaseFlip()){
				s_flip_status=FLIP_STATE_FLIP_RELEASED;
			}
		}
		break;
	case FLIP_STATE_V1:
		// second vblank since the flip (30Hz pacing) — release now
		if (Gcm_ReleaseFlip()){
			s_flip_status=FLIP_STATE_FLIP_RELEASED;
		}
		break;
	case FLIP_STATE_FLIP_RELEASED:
		// waiting on the RSX; nothing to do until the flip callback fires
		break;
	default:
		assert(0);
	}
}
// Flip-complete callback from the RSX: advance FLIP_RELEASED -> FLIPPED.
// Any other state is left untouched.
static void Gcm_FlipCallbackFunction(const uint32_t head)
{
	(void)head;	// unused

	if ( s_flip_status == FLIP_STATE_FLIP_RELEASED )
	{
		s_flip_status = FLIP_STATE_FLIPPED;
	}
}
// initialize flip control state machine: HSYNC flip mode, frame counters, the
// flip-control label (starts in WAIT), and the flip/vblank interrupt handlers.
static void Gcm_InitFlipControl(void)
{
	cellGcmSetFlipMode( CELL_GCM_DISPLAY_HSYNC );

	g_ps3gcmGlobalState.m_frameNo = 0;
	g_ps3gcmGlobalState.m_finishIdx = 0;

	s_label_flip_control=cellGcmGetLabelAddress(GCM_LABEL_FLIP_CONTROL);
	*s_label_flip_control=LABEL_FLIP_CONTROL_WAIT;

	cellGcmSetFlipHandler(Gcm_FlipCallbackFunction);
	cellGcmSetVBlankHandler(Gcm_VblankCallbackFunction);
}
//--------------------------------------------------------------------------------------------------
// Beginscene, endscene and flip
//--------------------------------------------------------------------------------------------------

// Command-buffer usage stats: largest byte count written within a scene, and the write
// cursor captured at the most recent BeginScene.
uint32 gCmdBufferHighWater = 0;
uint32 gCmdBufferStart = 0;

// Marks the start of a scene: remember the command-buffer write position (for the
// high-water stat in EndScene) and forward to the draw state.
void CPs3gcmGlobalState::BeginScene()
{
	gCmdBufferStart = (uint32)gpGcmContext->current;

	gpGcmDrawState->BeginScene();
}
void CPs3gcmGlobalState::EndScene()
{
if ( (uint32)gpGcmContext->current > gCmdBufferStart )
{
uint32 bytes = (uint32)gpGcmContext->current - gCmdBufferStart;
if (bytes > gCmdBufferHighWater ) gCmdBufferHighWater = bytes;
}
gpGcmDrawState->EndScene();
}
// Plat_FloatTime() of the most recent SetFastFlip call; Flip() compares against it
// to decide when to force a full flip.
float g_fliptime = 0;

// Toggles fast-flip mode and timestamps the change.
void CPs3gcmGlobalState::SetFastFlip(bool onoff)
{
	g_fliptime = Plat_FloatTime();
	m_fastFlip = onoff;
}
// Reports the measured RSX frame time (ms) — implemented elsewhere.
extern void OnFrameTimestampAvailableRsx( float ms );

// Presents the current frame.
//
// Fast-flip path: when m_fastFlip is set and SetFastFlip was called within the last 50ms,
// skip the real flip — release the flip label, end the frame, flush, and jump straight to
// new-frame setup.
//
// Full path: end the frame, spin until the previous flip completes (servicing audio/server
// requests while waiting), stamp the GPU end-of-frame timestamp, optionally defrag VRAM,
// report last frame's GPU time, queue the new flip gated on the flip-control label, and
// reclaim local memory. Both paths finish by bumping the frame counter, stamping the
// start-of-frame timestamp and resetting RSX state.
void CPs3gcmGlobalState::Flip()
{
	// Service system utility callbacks
	cellSysutilCheckCallback();

	if(m_fastFlip)
	{
		Gcm_ReleaseFlip();

		float time = Plat_FloatTime();
		if ( (time - g_fliptime) > 0.05) goto fullflip;

		// Just end the frame, no point in flipping here...
		gpGcmDrawState->EndFrame();
		GCM_FUNC( cellGcmFlush );
		goto newframe;
	}

fullflip:

	int idx, startIdx, endIdx;

	//--------------------------------------------------------------------------------------------------
	// Ensure any buffered state, copies etc... goes to GPU
	//--------------------------------------------------------------------------------------------------

	gpGcmDrawState->EndFrame();

	//--------------------------------------------------------------------------------------------------
	// Wait for previous frame Flip
	//--------------------------------------------------------------------------------------------------

	while (cellGcmGetFlipStatus()!=0){
		// Keep servicing audio/server requests while spinning
		g_pGcmSharedData->CheckForAudioRequest();
		g_pGcmSharedData->CheckForServerRequest();
		sys_timer_usleep(300);
	}

	// Insert end of gpu timestamp (double-buffered by frame parity)
	idx = m_frameNo&1;
	endIdx = GCM_REPORT_TIMESTAMP_FRAME_FIRST + (idx*2) + 1;
	GCM_FUNC( cellGcmSetTimeStamp, endIdx );

	//--------------------------------------------------------------------------------------------------
	// If requested, lets defrag VRAM
	//--------------------------------------------------------------------------------------------------

	if (g_pGcmSharedData->m_bDeFrag)
	{
		g_pGcmSharedData->m_bDeFrag = 0;
		extern void Ps3gcmLocalMemoryAllocator_CompactWithReason( char const *szReason );
		Ps3gcmLocalMemoryAllocator_CompactWithReason( "End of Round" );
	}

	//--------------------------------------------------------------------------------------------------
	// Get Timestamps (previous frame's start/end pair -> GPU frame time)
	//--------------------------------------------------------------------------------------------------

	if (m_frameNo)
	{
		idx = ((m_frameNo-1) & 1);
		startIdx = GCM_REPORT_TIMESTAMP_FRAME_FIRST + (idx*2);
		endIdx = startIdx+1;

		uint64 uiStartTimestamp = cellGcmGetTimeStamp( startIdx );
		uint64 uiEndTimestamp = cellGcmGetTimeStamp( endIdx );
		uint64 uiRsxTimeInNanoSeconds = uiEndTimestamp - uiStartTimestamp;
		OnFrameTimestampAvailableRsx( uiRsxTimeInNanoSeconds / 1000000.0f );
	}

	//--------------------------------------------------------------------------------------------------
	// Insert new flip command and flush gpu
	//--------------------------------------------------------------------------------------------------

	// reset FlipStatus = 1
	cellGcmResetFlipStatus();

	// queue Flip command; the RSX waits on the flip-control label so the vblank
	// handler paces us to m_flipMode
	GCM_FUNC( cellGcmSetFlipWithWaitLabel, m_display.surfaceFlipIdx, GCM_LABEL_FLIP_CONTROL, LABEL_FLIP_CONTROL_READY);

	m_display.Flip();

	GCM_FUNC( cellGcmSetWriteCommandLabel, GCM_LABEL_FLIP_CONTROL, LABEL_FLIP_CONTROL_WAIT);
	GCM_FUNC( cellGcmSetWaitFlip );
	GCM_FUNC( cellGcmFlush );

	extern void Ps3gcmLocalMemoryAllocator_Reclaim();
	Ps3gcmLocalMemoryAllocator_Reclaim();

	//--------------------------------------------------------------------------------------------------
	// Start a new frame
	//--------------------------------------------------------------------------------------------------

newframe:

	m_frameNo ++;

	// Insert start of gpu timestamp
	idx = m_frameNo&1;
	startIdx = GCM_REPORT_TIMESTAMP_FRAME_FIRST + (idx*2);
	GCM_FUNC( cellGcmSetTimeStamp, startIdx );

	// Put RSX into known state for start of frame
	gpGcmDrawState->ResetRsxState();

	// Moved from DX present()
	GCM_FUNC( cellGcmSetInvalidateVertexCache );
}
//--------------------------------------------------------------------------------------------------
// Buffer management
//--------------------------------------------------------------------------------------------------

// Factory: creates a CPs3gcmBuffer backed by a local-memory block of the requested
// size and allocation type. Caller releases via Release().
CPs3gcmBuffer * CPs3gcmBuffer::New( uint32 uiSize, CPs3gcmAllocationType_t uType )
{
	CPs3gcmBuffer *pBuffer = new CPs3gcmBuffer;
	pBuffer->m_lmBlock.Alloc( uType, uiSize );
	return pBuffer;
}
// Releases the buffer: frees the local-memory block (the allocator defers the actual
// reclaim until the RSX is done with it) and destroys this object.
void CPs3gcmBuffer::Release()
{
	// Wait for RSX to finish using the buffer memory
	// and free it later
	m_lmBlock.Free();
	delete this;
}

View File

@@ -0,0 +1,272 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Gcm renderer state and util functions
//
//==================================================================================================
#ifndef INCLUDED_GCMSTATE_H
#define INCLUDED_GCMSTATE_H
#ifndef SPU
#include "tier0/platform.h"
#include "tier0/dbg.h"
#include "cell\gcm.h"
#include "gcmconfig.h"
#include "ps3gcmmemory.h"
#include "dxabstract_def.h"
#include "spumgr_ppu.h"
#else
#include "spumgr_spu.h"
#endif
//--------------------------------------------------------------------------------------------------
// Misc
//--------------------------------------------------------------------------------------------------
// Returns the smaller of the two values (b when they compare equal).
template <typename T>
inline T Min( T a, T b )
{
	if ( a < b )
	{
		return a;
	}
	return b;
}
// Returns the larger of the two values (b when they compare equal).
template <typename T>
inline T Max( T a, T b )
{
	if ( a > b )
	{
		return a;
	}
	return b;
}
// Exchanges the contents of a and b through a temporary copy.
template <typename T>
inline void Swap( T& a , T & b )
{
	T tmp = a;
	a = b;
	b = tmp;
}
//--------------------------------------------------------------------------------------------------
// Literals
//--------------------------------------------------------------------------------------------------
// IO Memory (page size is 1MB, so make these add up to a whole number of MB)
#define GCM_MAINPOOLSIZE (0 * 0x100000) // IO memory for main pool
#define GCM_DEFCMDBUFFSIZE (1 * 0x200000) // Default command buff (must be pow 2)
#define GCM_CALLCMDBUFFSIZE (2 * 0x10000) // 256 K of cmd buffer to call to
// Used for DrawprimUP
#define GCM_CALLCMDSEGSIZE 0x8000 // 32K segmentation
#define GCM_PATCHBUFFSIZE ((2 * 0x100000) - GCM_CALLCMDBUFFSIZE)
#define GCM_PATCHSEGSIZE 0x8000
#define GCM_IOSIZE (GCM_MAINPOOLSIZE + GCM_DEFCMDBUFFSIZE + GCM_CALLCMDBUFFSIZE + GCM_PATCHBUFFSIZE)
//--------------------------------------------------------------------------------------------------
// Display Structure
//--------------------------------------------------------------------------------------------------

// Double-buffered scanout surfaces plus the depth buffer, and the flip index bookkeeping.
struct CPs3gcmDisplay
{
	uint32 surfaceFlipIdx; // which scanout color buffer will be presented with flip

	enum EnumConst_t { SURFACE_COUNT = 2 };
	CPs3gcmLocalMemoryBlockSystemGlobal surfaceColor[SURFACE_COUNT];	// scanout color buffers for double-buffering
	// (need one more to avoid overwriting old buffer)
	CPs3gcmLocalMemoryBlockSystemGlobal surfaceDepth;					// depth buffer

	// Advance the flip index to the next scanout surface.
	void Flip()
	{
		surfaceFlipIdx = NextSurfaceIndex();
	}

	// Index of the surface nFrame flips ahead of the current one (mod SURFACE_COUNT).
	uint NextSurfaceIndex( int nFrame = 1 )const
	{
		return ( surfaceFlipIdx + nFrame ) % SURFACE_COUNT;
	}

	// Index of the surface nFrame flips behind the current one. The +1 bias of
	// 1000000 * SURFACE_COUNT keeps the dividend non-negative before the modulo.
	uint PrevSurfaceIndex( int nFrame )const
	{
		int nResult = int( surfaceFlipIdx + 1000000 * SURFACE_COUNT - nFrame ) % int( SURFACE_COUNT );
		Assert( uint( nResult ) < SURFACE_COUNT ); // if this is negative, it means we did ( ( something ) mod 2 ) mod 3, which makes no sense in this context
		return uint( nResult );
	}
};
//--------------------------------------------------------------------------------------------------
// Global GCM state class
//--------------------------------------------------------------------------------------------------

// Central renderer state: memory layout (RSX local + mapped IO), command/call/patch
// buffers, the SPU draw task handle, the empty pixel shader, flip pacing and display
// configuration. A single instance (g_ps3gcmGlobalState) exists.
struct CPs3gcmGlobalState
{
	//--------------------------------------------------------------------------------------------------
	// Memory
	// RSX Local, plus one block of memory mapped into RSX (IO mem)
	// Main memory pool is within the IO mem and is used for textures until it fills...
	//--------------------------------------------------------------------------------------------------

	// RSX local memory
	void * m_pLocalBaseAddress;			// RSX Local Memory Base Address
	uint32 m_nLocalBaseOffset;			// cellGcmAddressToOffset( m_pLocalBaseAddress )
	uint32 m_nLocalSize;				// RSX Local Memory Size

	// IO memory mapped into RSX
	void * m_pIoAddress;				// RSX IO buffer, base address
	uint32 m_nIoSize;					// RSX IO total size [including CMD buffer]
	uint32 m_nIoOffsetDelta;			// add this to EA to get Io Offset

	// Call Cmd Buffer (ring of GCM_CALLCMDSEGSIZE segments used by DrawPrimitiveUP)
	void* m_pCallCmdBuffer;
	uint32 m_nCallCmdBufferoffset;
	uint32 m_nCallWritePos;				// Current posn (offset)
	uint32 m_nCallReadSegment;

	// main memory pool buffer
	void * m_pRsxMainMemoryPoolBuffer;
	uint32 m_nRsxMainMemoryPoolBufferSize;

	// Pointer to the draw states
	uint32 m_eaDrawStates;

	//--------------------------------------------------------------------------------------------------
	// SPU Task
	//--------------------------------------------------------------------------------------------------

	SpuTaskHandle m_spuHandle;

	//--------------------------------------------------------------------------------------------------
	// Patched Shaders
	//--------------------------------------------------------------------------------------------------

	uint8* m_pPatchBuff;
	uint32 m_nPatchIdx;					// Write index for this frames patch buffer
	uint32 m_nPatchReadSeg;

	//--------------------------------------------------------------------------------------------------
	// Empty pixel shader
	//--------------------------------------------------------------------------------------------------

	CPs3gcmLocalMemoryBlock m_pShaderPsEmptyBuffer;
	CgBinaryProgram *m_pShaderPsEmpty;	// empty pixel shader
	uint32 m_nPsEmptyShaderControl0;
	uint32 m_nPsEmptyAttributeInputMask;

	//--------------------------------------------------------------------------------------------------
	// Flip data
	//--------------------------------------------------------------------------------------------------

	uint32 m_flipMode;					// Holds 30 or 60
	uint32 m_frameNo;
	uint32 m_finishIdx;
	bool m_fastFlip;

	//--------------------------------------------------------------------------------------------------
	// Display
	//--------------------------------------------------------------------------------------------------

	// Display size, aspect, pitch
	uint16 m_nRenderSize[2];			// width & height of the render buffer
	float m_flRenderAspect;				// aspect ratio of the output device
	uint32 m_nSurfaceRenderPitch;

	CPs3gcmDisplay m_display;

	//--------------------------------------------------------------------------------------------------
	// Methods
	//--------------------------------------------------------------------------------------------------

public:
	int32 Init();
	void Shutdown();

	void BeginScene();
	void EndScene();
	void Flip();
	void SetFastFlip(bool onoff);

	static int32_t CmdBufferFull(struct CellGcmContextData * pGcmContext, uint32_t size);

	// DrawPrimUP puts a drawprimup call into the call buffer, with a label and RET.
	// It's called from the gcmdrawstate which then sends a drawcall packet to the SPU
	uint32 DrawPrimitiveUP(D3DPRIMITIVETYPE nPrimitiveType,UINT nPrimitiveCount,
						   CONST void *pVertexStreamZeroData, UINT nVertexStreamZeroStride );

	// GetRenderSurfaceBytes Note:
	// Height alignment must be 32 for tiled surfaces on RSX
	// 128 for Edge Post MLAA
	// 64 for Edge Post MLAA with EDGE_POST_MLAA_MODE_TRANSPOSE_64 flag set
	inline uint GetRenderSurfaceBytes( uint nHeightAlignment = 32 ) const ;

private:

	int InitGcm();
	int InitVideo();

	void CreateRsxBuffers();			// Display buffers and default allocated RTs etc..
	void CreateIoBuffers();				// Allocs IO memory (mapped in Initgcm)
};
//--------------------------------------------------------------------------------------------------
// Inlines
//--------------------------------------------------------------------------------------------------

// Size in bytes of one render surface: pitch times the surface height rounded up to
// nHeightAlignment lines (see the alignment notes on the declaration above).
inline uint CPs3gcmGlobalState::GetRenderSurfaceBytes( uint nHeightAlignment) const
{
	return m_nSurfaceRenderPitch * AlignValue( m_nRenderSize[1], nHeightAlignment );
}
//--------------------------------------------------------------------------------------------------
// Extern Globals
//--------------------------------------------------------------------------------------------------

// Defined in gcmstate.cpp: primary/call GCM contexts and the global renderer state.
extern CellGcmContextData gGcmContext;
extern CellGcmContextData* gpGcmContext;
extern CPs3gcmGlobalState g_ps3gcmGlobalState;
extern CellGcmContextData gCallContext;
extern CellGcmContextData* gpCallContext;
//--------------------------------------------------------------------------------------------------
// Memory block funcs that need access to g_ps3gcmGlobalState
//--------------------------------------------------------------------------------------------------

#ifndef SPU

// PPU-visible address of a block living in RSX local memory: its local-memory offset
// rebased onto the mapped local-memory base address.
inline char * CPs3gcmLocalMemoryBlock::DataInLocalMemory() const
{
	Assert( IsLocalMemory() );
	return
		( m_nLocalMemoryOffset - g_ps3gcmGlobalState.m_nLocalBaseOffset ) +
		( char * ) g_ps3gcmGlobalState.m_pLocalBaseAddress;
}
// PPU-visible address of a block living in RSX-mapped main (IO) memory:
// its offset added to the IO base address.
inline char * CPs3gcmLocalMemoryBlock::DataInMainMemory() const
{
	Assert( !IsLocalMemory() && IsRsxMappedMemory() );
	return
		m_nLocalMemoryOffset +
		( ( char * ) g_ps3gcmGlobalState.m_pIoAddress );
}
// Address of a block in plain heap memory: for malloc-pool blocks, the "offset"
// field actually stores the pointer value itself.
inline char * CPs3gcmLocalMemoryBlock::DataInMallocMemory() const
{
	Assert( !IsLocalMemory() && !IsRsxMappedMemory() );
	return ( char * ) m_nLocalMemoryOffset;
}
// Dispatches to the correct accessor based on the block's allocation pool
// (local memory by default, main/IO memory, or malloc'd heap memory).
inline char * CPs3gcmLocalMemoryBlock::DataInAnyMemory() const
{
	switch ( PS3GCMALLOCATIONPOOL( m_uType ) )
	{
	default: return DataInLocalMemory();
	case kGcmAllocPoolMainMemory: return DataInMainMemory();
	case kGcmAllocPoolMallocMemory: return DataInMallocMemory();
	}
}
#endif
#endif // INCLUDED_GCMSTATE_H

View File

@@ -0,0 +1,515 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
//
//
//==================================================================================================
#include "tier0/platform.h"
#include "tier0/dbg.h"
#include "tier1/strtools.h"
#include "tier1/utlbuffer.h"
#include "utlmap.h"
#include "ps3gcmmemory.h"
#include "gcmstate.h"
#include "bitmap/imageformat_declarations.h"
#include "gcmtexture.h"
#include "memdbgon.h"
#ifdef _CERT
#define Debugger() ((void)0)
#else
#define Debugger() DebuggerBreak()
#endif
//--------------------------------------------------------------------------------------------------
// Texture Layouts
//--------------------------------------------------------------------------------------------------
#ifdef _CERT
#define GLMTEX_FMT_DESC( x )
#else
#define GLMTEX_FMT_DESC( x ) x ,
#endif
#define CELL_GCM_REMAP_MODE_OIO(order, inputARGB, outputARGB) \
(((order)<<16)|((inputARGB))|((outputARGB)<<8))
#define REMAPO( x ) CELL_GCM_TEXTURE_REMAP_ORDER_X##x##XY
#define REMAP4(a,r,g,b) (((a)<<0)|((r)<<2)|((g)<<4)|((b)<<6))
#define REMAP_ARGB REMAP4( CELL_GCM_TEXTURE_REMAP_FROM_A, CELL_GCM_TEXTURE_REMAP_FROM_R, CELL_GCM_TEXTURE_REMAP_FROM_G, CELL_GCM_TEXTURE_REMAP_FROM_B )
#define REMAP_4 REMAP4( CELL_GCM_TEXTURE_REMAP_REMAP, CELL_GCM_TEXTURE_REMAP_REMAP, CELL_GCM_TEXTURE_REMAP_REMAP, CELL_GCM_TEXTURE_REMAP_REMAP )
#define REMAP_13 REMAP4( CELL_GCM_TEXTURE_REMAP_ONE, CELL_GCM_TEXTURE_REMAP_REMAP, CELL_GCM_TEXTURE_REMAP_REMAP, CELL_GCM_TEXTURE_REMAP_REMAP )
#define REMAP_4X(x) REMAP4( x, x, x, x )
#define REMAP_13X(y, x) REMAP4( y, x, x, x )
#define REMAP_ALL_DEFAULT CELL_GCM_REMAP_MODE_OIO( REMAPO(Y), REMAP_ARGB, REMAP_4 )
#define REMAP_ALL_DEFAULT_X CELL_GCM_REMAP_MODE_OIO( REMAPO(X), REMAP_ARGB, REMAP_4 )
#define CAP( x ) CPs3gcmTextureLayout::Format_t::kCap##x
// Texture format table, indexed by CPs3gcmTextureLayout::m_nFormat.
// The first PS3_TEX_CANONICAL_FORMAT_COUNT entries are the canonical formats;
// the remaining slots are filled at runtime with per-texture modified copies
// (tiled-pitch overrides) by CPs3gcmTextureLayout::New.
CPs3gcmTextureLayout::Format_t g_ps3texFormats[PS3_TEX_MAX_FORMAT_COUNT] =
{
// Initializer order per entry (matches Format_t field order):
// summ-name d3d-format
// gcmRemap
// gcmPitchPer4X (pitch contribution per 4 pixels of width)
// gcmFormat
// gcmFlags (caps)
{ GLMTEX_FMT_DESC("_D16") D3DFMT_D16,
REMAP_ALL_DEFAULT,
8,
CELL_GCM_TEXTURE_DEPTH16,
0 },
{ GLMTEX_FMT_DESC("_D24X8") D3DFMT_D24X8,
REMAP_ALL_DEFAULT,
16,
CELL_GCM_TEXTURE_DEPTH24_D8,
0 },
{ GLMTEX_FMT_DESC("_D24S8") D3DFMT_D24S8,
REMAP_ALL_DEFAULT,
16,
CELL_GCM_TEXTURE_DEPTH24_D8,
0 },
{ GLMTEX_FMT_DESC("_A8R8G8B8") D3DFMT_A8R8G8B8,
REMAP_ALL_DEFAULT,
16,
CELL_GCM_TEXTURE_A8R8G8B8,
CAP(SRGB) },
{ GLMTEX_FMT_DESC("_X8R8G8B8") D3DFMT_X8R8G8B8,
REMAP_ALL_DEFAULT,
16,
CELL_GCM_TEXTURE_A8R8G8B8,
CAP(SRGB) },
// X1R5G5B5 is stored in a R5G6B5 GCM texture (no direct GCM equivalent)
{ GLMTEX_FMT_DESC("_X1R5G5B5") D3DFMT_X1R5G5B5,
CELL_GCM_REMAP_MODE_OIO( REMAPO(X), REMAP_ARGB, REMAP_13 ),
8,
CELL_GCM_TEXTURE_R5G6B5,
0 },
{ GLMTEX_FMT_DESC("_A1R5G5B5") D3DFMT_A1R5G5B5,
REMAP_ALL_DEFAULT_X,
8,
CELL_GCM_TEXTURE_A1R5G5B5,
0 },
// Luminance formats replicate the single B8/G8B8 channels via the remap mask
{ GLMTEX_FMT_DESC("_L8") D3DFMT_L8,
CELL_GCM_REMAP_MODE_OIO( REMAPO(Y), REMAP_4X(CELL_GCM_TEXTURE_REMAP_FROM_B), REMAP_13 ),
4,
CELL_GCM_TEXTURE_B8,
0 },
{ GLMTEX_FMT_DESC("_A8L8") D3DFMT_A8L8,
CELL_GCM_REMAP_MODE_OIO( REMAPO(Y), REMAP_13X( CELL_GCM_TEXTURE_REMAP_FROM_G, CELL_GCM_TEXTURE_REMAP_FROM_B), REMAP_4 ),
8,
CELL_GCM_TEXTURE_G8B8,
0 },
{ GLMTEX_FMT_DESC("_DXT1") D3DFMT_DXT1,
CELL_GCM_REMAP_MODE_OIO( REMAPO(Y), REMAP_ARGB, REMAP_13 ),
8,
CELL_GCM_TEXTURE_COMPRESSED_DXT1,
CAP(SRGB) | CAP(4xBlocks) },
{ GLMTEX_FMT_DESC("_DXT3") D3DFMT_DXT3,
REMAP_ALL_DEFAULT,
16,
CELL_GCM_TEXTURE_COMPRESSED_DXT23,
CAP(SRGB) | CAP(4xBlocks) },
{ GLMTEX_FMT_DESC("_DXT5") D3DFMT_DXT5,
REMAP_ALL_DEFAULT,
16,
CELL_GCM_TEXTURE_COMPRESSED_DXT45,
CAP(SRGB) | CAP(4xBlocks) },
{ GLMTEX_FMT_DESC("_A16B16G16R16F") D3DFMT_A16B16G16R16F,
REMAP_ALL_DEFAULT_X,
32,
CELL_GCM_TEXTURE_W16_Z16_Y16_X16_FLOAT,
0 },
// NOTE: A16B16G16R16 (integer) is widened to the 32-bit float GCM format (pitch 64)
{ GLMTEX_FMT_DESC("_A16B16G16R16") D3DFMT_A16B16G16R16,
REMAP_ALL_DEFAULT_X,
64,
CELL_GCM_TEXTURE_W32_Z32_Y32_X32_FLOAT,
0 },
{ GLMTEX_FMT_DESC("_A32B32G32R32F") D3DFMT_A32B32G32R32F,
REMAP_ALL_DEFAULT_X,
64,
CELL_GCM_TEXTURE_W32_Z32_Y32_X32_FLOAT,
0 },
// 24-bit RGB is expanded to A8R8G8B8 with alpha forced to one via the remap
{ GLMTEX_FMT_DESC("_R8G8B8") D3DFMT_R8G8B8,
CELL_GCM_REMAP_MODE_OIO( REMAPO(Y),
REMAP4( CELL_GCM_TEXTURE_REMAP_FROM_B, CELL_GCM_TEXTURE_REMAP_FROM_A, CELL_GCM_TEXTURE_REMAP_FROM_R, CELL_GCM_TEXTURE_REMAP_FROM_G ),
REMAP_13 ),
16,
CELL_GCM_TEXTURE_A8R8G8B8,
CAP(SRGB) },
{ GLMTEX_FMT_DESC("_A8") D3DFMT_A8,
CELL_GCM_REMAP_MODE_OIO( REMAPO(Y),
REMAP4( CELL_GCM_TEXTURE_REMAP_FROM_B, CELL_GCM_TEXTURE_REMAP_FROM_R, CELL_GCM_TEXTURE_REMAP_FROM_B, CELL_GCM_TEXTURE_REMAP_FROM_B ),
REMAP_13X( CELL_GCM_TEXTURE_REMAP_REMAP, CELL_GCM_TEXTURE_REMAP_ZERO ) ),
4,
CELL_GCM_TEXTURE_B8,
0 },
// NOTE: R5G6B5 is expanded to A8R8G8B8 (not stored as native 16bpp)
{ GLMTEX_FMT_DESC("_R5G6B5") D3DFMT_R5G6B5,
CELL_GCM_REMAP_MODE_OIO( REMAPO(Y),
REMAP4( CELL_GCM_TEXTURE_REMAP_FROM_B, CELL_GCM_TEXTURE_REMAP_FROM_A, CELL_GCM_TEXTURE_REMAP_FROM_R, CELL_GCM_TEXTURE_REMAP_FROM_G ),
REMAP_13 ),
16,
CELL_GCM_TEXTURE_A8R8G8B8,
CAP(SRGB) },
{ GLMTEX_FMT_DESC("_Q8W8V8U8") D3DFMT_Q8W8V8U8,
REMAP_ALL_DEFAULT,
16,
CELL_GCM_TEXTURE_A8R8G8B8,
CAP(SRGB) },
};
// Runtime count: starts at the canonical count and grows as tiled layouts
// append modified format copies (see CPs3gcmTextureLayout::New).
uint g_nPs3texFormatCount = PS3_TEX_CANONICAL_FORMAT_COUNT;
#undef CAP
#undef GLMTEX_FMT_DESC
static bool Ps3texLayoutLessFunc( CPs3gcmTextureLayout::Key_t const &a, CPs3gcmTextureLayout::Key_t const &b )
{
	// Strict weak ordering over the raw key bytes; an arbitrary but consistent
	// total order is all the layout cache map needs.
	int nDelta = memcmp( &a, &b, sizeof( CPs3gcmTextureLayout::Key_t ) );
	return nDelta < 0;
}
static CUtlMap< CPs3gcmTextureLayout::Key_t, CPs3gcmTextureLayout const * > s_ps3texLayouts( Ps3texLayoutLessFunc );
CPs3gcmTextureLayout const * CPs3gcmTextureLayout::New( Key_t const &k )
{
	// Look up 'key' in the layout cache: on a hit just bump the refcount and
	// return the cached layout. On a miss, derive the complete layout (slice
	// table, storage size, allocation pool) from the key, cache it with a
	// refcount of 1, and return it.
	unsigned short index = s_ps3texLayouts.Find( k );
	if ( index != s_ps3texLayouts.InvalidIndex() )
	{
		CPs3gcmTextureLayout const *layout = s_ps3texLayouts[ index ];
		++ layout->m_refCount;
		return layout;
	}

	// Need to generate complete information about the texture layout
	uint8 nMips = ( k.m_texFlags & kfMip ) ? k.m_nActualMipCount : 1;
	uint8 nFaces = ( k.m_texFlags & kfTypeCubeMap ) ? 6 : 1;
	uint32 nSlices = nMips * nFaces;

	// Allocate layout memory (header plus one Slice_t per face/mip pair)
	size_t numLayoutBytes = sizeof( CPs3gcmTextureLayout ) + nSlices * sizeof( Slice_t );
	CPs3gcmTextureLayout *layout = ( CPs3gcmTextureLayout * ) MemAlloc_AllocAligned( numLayoutBytes, 16 );
	memset( layout, 0, numLayoutBytes );
	memcpy( &layout->m_key, &k, sizeof( Key_t ) );
	layout->m_refCount = 1;

	// Find the format descriptor (index into g_ps3texFormats)
	for ( int j = 0; j < PS3_TEX_CANONICAL_FORMAT_COUNT; ++ j )
	{
		if ( g_ps3texFormats[j].m_d3dFormat == k.m_texFormat )
		{
			layout->m_nFormat = j;
			break;
		}
		// Reaching the last entry without a match means the format is unsupported
		Assert( j != PS3_TEX_CANONICAL_FORMAT_COUNT - 1 );
	}
	layout->m_mipCount = nMips;

	//
	// Slices: compute per-face/per-mip offsets and sizes
	//
	bool bSwizzled = layout->IsSwizzled();
	size_t fmtPitch = layout->GetFormatPtr()->m_gcmPitchPer4X;
	size_t fmtPitchBlock = ( layout->GetFormatPtr()->m_gcmCaps & CPs3gcmTextureLayout::Format_t::kCap4xBlocks ) ? 16 : 4;
	size_t numDataBytes = 0;
	Slice_t *pSlice = &layout->m_slices[0];
	for ( int face = 0; face < nFaces; ++ face )
	{
		// For cubemaps every next face in swizzled addressing
		// must be aligned on 128-byte boundary
		if ( bSwizzled )
		{
			numDataBytes = ( numDataBytes + 127 ) & ~127;
		}
		for ( int mip = 0; mip < nMips; ++ mip, ++ pSlice )
		{
			// Mip dimensions, clamped so they never reach zero
			for ( int j = 0; j < ARRAYSIZE( k.m_size ); ++ j )
			{
				pSlice->m_size[j] = k.m_size[j] >> mip;
				pSlice->m_size[j] = MAX( pSlice->m_size[j], 1 );
			}
			pSlice->m_storageOffset = numDataBytes;

			size_t numTexels;
			// For linear layout textures every mip row must be padded to the
			// width of the original highest level mip so that the pitch was
			// the same for every mip
			if ( bSwizzled )
				numTexels = ( pSlice->m_size[0] * pSlice->m_size[1] * pSlice->m_size[2] );
			else
				numTexels = ( k.m_size[0] * pSlice->m_size[1] * pSlice->m_size[2] );

			size_t numBytes = ( numTexels * fmtPitch ) / fmtPitchBlock;
			if ( layout->GetFormatPtr()->m_gcmCaps & CPs3gcmTextureLayout::Format_t::kCap4xBlocks )
			{
				// Ensure the size of the smallest mipmap levels of DXT1/3/5 textures (the 1x1 and 2x2 mips) is accurately computed.
				numBytes = MAX( numBytes, fmtPitch );
			}
			pSlice->m_storageSize = MAX( numBytes, 1 );

			numDataBytes += pSlice->m_storageSize;
		}
	}
	// Make the total size 128-byte aligned
	// Realistically it is required only for depth textures
	numDataBytes = ( numDataBytes + 127 ) & ~127;

	//
	// Tiled and ZCull memory adjustments
	//
	layout->m_gcmAllocType = GCM_MAINPOOLSIZE ? kAllocPs3gcmTextureData0 : kAllocPs3gcmTextureData;
	if ( layout->IsTiledMemory() )
	{
		// Tiled targets get a private copy of their format descriptor with the
		// pitch overridden by the tiled pitch, appended past the canonical entries.
		if ( g_nPs3texFormatCount >= PS3_TEX_MAX_FORMAT_COUNT )
		{
			Error("Modified ps3 format array overflow. Increase PS3_TEX_MAX_FORMAT_COUNT appropriately and recompile\n");
		}
		Format_t *pModifiedFormat = &g_ps3texFormats[g_nPs3texFormatCount];
		V_memcpy( pModifiedFormat, layout->GetFormatPtr(), sizeof( Format_t ) );
		layout->m_nFormat = g_nPs3texFormatCount;
		g_nPs3texFormatCount ++;

		if ( k.m_texFlags & kfTypeDepthStencil )
		{
			//
			// Tiled Zcull Surface (zcull areas must be 64-pixel aligned)
			//
			uint32 zcullSize[2] = { AlignValue( k.m_size[0], 64 ), AlignValue( k.m_size[1], 64 ) };

			uint32 nDepthPitch;
			if ( k.m_texFormat == D3DFMT_D16 )
				nDepthPitch = cellGcmGetTiledPitchSize( zcullSize[0] * 2 );
			else
				nDepthPitch = cellGcmGetTiledPitchSize( zcullSize[0] * 4 );
			pModifiedFormat->m_gcmPitchPer4X = nDepthPitch;

			uint32 uDepthBufferSize32bpp = nDepthPitch * zcullSize[1];
			uDepthBufferSize32bpp = AlignValue( uDepthBufferSize32bpp, PS3GCMALLOCATIONALIGN( kAllocPs3gcmDepthBuffer ) );
			Assert( uDepthBufferSize32bpp >= numDataBytes );
			numDataBytes = uDepthBufferSize32bpp;

			layout->m_gcmAllocType = kAllocPs3gcmDepthBuffer;
		}
		else
		{
			//
			// Tiled Color Surface
			//
			uint32 nTiledPitch = cellGcmGetTiledPitchSize( k.m_size[0] * layout->GetFormatPtr()->m_gcmPitchPer4X / 4 );
			pModifiedFormat->m_gcmPitchPer4X = nTiledPitch;

			// We Don't allocate any 512x512 RTs (they are used only when in PAL576i which can use the FB mem pool)
			/*if ( k.m_size[0] == 512 && k.m_size[1] == 512 && k.m_size[2] == 1 )
				layout->m_gcmAllocType = kAllocPs3gcmColorBuffer512;
			else*/
			if ( k.m_size[0] == g_ps3gcmGlobalState.m_nRenderSize[0] && k.m_size[1] == g_ps3gcmGlobalState.m_nRenderSize[1] && k.m_size[2] == 1 )
				layout->m_gcmAllocType = kAllocPs3gcmColorBufferFB;
			else if ( k.m_size[0] == g_ps3gcmGlobalState.m_nRenderSize[0]/4 && k.m_size[1] == g_ps3gcmGlobalState.m_nRenderSize[1]/4 && k.m_size[2] == 1 )
				layout->m_gcmAllocType = kAllocPs3gcmColorBufferFBQ;
			else
				layout->m_gcmAllocType = kAllocPs3gcmColorBufferMisc;

			uint32 uRenderSize = nTiledPitch * AlignValue( k.m_size[1], 32 ); // 32-line vertical alignment required in local memory
			if ( layout->m_gcmAllocType == kAllocPs3gcmColorBufferMisc )
				uRenderSize = AlignValue( uRenderSize, PS3GCMALLOCATIONALIGN( kAllocPs3gcmColorBufferMisc ) );
			Assert( uRenderSize >= numDataBytes );
			numDataBytes = uRenderSize;
		}
	}
	layout->m_storageTotalSize = numDataBytes;

	//
	// Finished creating the layout information
	//
#ifndef _CERT
	// generate summary
	// "target, format, +/- mips, base size"
	char scratch[1024];
	// BUG FIX: this used to read `char *targetname = targetname = "2D ";` —
	// a redundant double-assignment typo; a plain initialization is intended.
	const char *targetname = "2D ";
	if ( layout->IsVolumeTex() )
		targetname = "3D ";
	if ( layout->IsCubeMap() )
		targetname = "CUBE";
	// Use bounded V_snprintf instead of sprintf to protect the stack buffer
	V_snprintf( scratch, sizeof( scratch ), "[%s %s %dx%dx%d mips=%d slices=%d flags=%02X%s]",
		targetname,
		layout->GetFormatPtr()->m_formatSummary,
		layout->m_key.m_size[0], layout->m_key.m_size[1], layout->m_key.m_size[2],
		nMips,
		nSlices,
		layout->m_key.m_texFlags,
		( layout->m_key.m_texFlags & kfSrgbEnabled ) ? " SRGB" : ""
		);
	layout->m_layoutSummary = strdup( scratch );
#endif

	// then insert into map. disregard returned index.
	s_ps3texLayouts.Insert( k, layout );
	return layout;
}
void CPs3gcmTextureLayout::Release() const
{
	// Drop one reference. The layout object itself is intentionally never
	// destroyed: it stays in the layout map so an identical key can reuse it.
	m_refCount --;
	Assert( m_refCount >= 0 );
}
//////////////////////////////////////////////////////////////////////////
//
// Texture management
//
CPs3gcmTexture * CPs3gcmTexture::New( CPs3gcmTextureLayout::Key_t const &key )
{
	// Build (or fetch from the cache) the layout describing this texture
	CPs3gcmTextureLayout const *pNewLayout = CPs3gcmTextureLayout::New( key );
	if ( !pNewLayout )
	{
		Debugger();
		return NULL;
	}

	// Zero-initialized texture header; clearing it also resets the embedded
	// CPs3gcmLocalMemoryBlock so Size() reports "not allocated yet"
	CPs3gcmTexture *pTexture = ( CPs3gcmTexture * ) MemAlloc_AllocAligned( sizeof( CPs3gcmTexture ), 16 );
	memset( pTexture, 0, sizeof( CPs3gcmTexture ) );
	pTexture->m_layout = pNewLayout;

	if ( key.m_texFlags & CPs3gcmTextureLayout::kfNoD3DMemory )
	{
		CPs3gcmAllocationType_t eAllocType = pNewLayout->m_gcmAllocType;
		bool bRenderTargetAlloc = ( eAllocType == kAllocPs3gcmDepthBuffer ) || ( eAllocType == kAllocPs3gcmColorBufferMisc );
		if ( !bRenderTargetAlloc )
		{
			// Early-out, storage will be allocated later (via IDirect3DDevice9::AllocateTextureStorage)
			return pTexture;
		}
		// Render-target storage cannot be deferred; fall through and allocate now
		Assert( 0 );
		Warning( "ERROR: (CPs3gcmTexture::New) depth/colour buffers should not be marked with kfNoD3DMemory!\n" );
	}

	pTexture->Allocate();
	return pTexture;
}
void CPs3gcmTexture::Release()
{
	// Free the GPU memory block first — the allocator defers the actual
	// release until RSX has finished using the memory — then drop our
	// reference on the shared layout and free the header itself.
	if ( m_lmBlock.Size() != 0 )
	{
		m_lmBlock.Free();
	}
	m_layout->Release();
	MemAlloc_FreeAligned( this );
}
bool CPs3gcmTexture::Allocate()
{
if ( m_lmBlock.Size() )
{
// Already allocated!
Assert( 0 );
Warning( "ERROR: CPs3gcmTexture::Allocate called twice!\n" );
return true;
}
CPs3gcmAllocationType_t uAllocationType = m_layout->m_gcmAllocType;
const CPs3gcmTextureLayout::Key_t & key = m_layout->m_key;
// if kAllocPs3gcmTextureData0 (main memory) fails try kAllocPs3gcmTextureData
if (!m_lmBlock.Alloc( uAllocationType, m_layout->m_storageTotalSize ) )
{
if (m_layout->m_gcmAllocType == kAllocPs3gcmTextureData0)
{
m_layout->m_gcmAllocType = kAllocPs3gcmTextureData;
CPs3gcmAllocationType_t uAllocationType = m_layout->m_gcmAllocType;
m_lmBlock.Alloc( uAllocationType, m_layout->m_storageTotalSize );
}
}
if ( m_layout->IsTiledMemory() )
{
if ( uAllocationType == kAllocPs3gcmDepthBuffer )
{
bool bIs16BitDepth = ( m_layout->GetFormatPtr()->m_gcmFormat == CELL_GCM_TEXTURE_DEPTH16 ) || ( m_layout->m_nFormat == CELL_GCM_TEXTURE_DEPTH16_FLOAT );
uint32 zcullSize[2] = { AlignValue( key.m_size[0], 64 ), AlignValue( key.m_size[1], 64 ) };
uint32 uiZcullIndex = m_lmBlock.ZcullMemoryIndex();
cellGcmBindZcull( uiZcullIndex,
m_lmBlock.Offset(),
zcullSize[0], zcullSize[1],
m_lmBlock.ZcullMemoryStart(),
bIs16BitDepth ? CELL_GCM_ZCULL_Z16 : CELL_GCM_ZCULL_Z24S8,
CELL_GCM_SURFACE_CENTER_1,
CELL_GCM_ZCULL_LESS,
CELL_GCM_ZCULL_LONES,
CELL_GCM_SCULL_SFUNC_ALWAYS,
0, 0 // sRef, sMask
);
uint32 uiTileIndex = m_lmBlock.TiledMemoryIndex();
cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL, m_lmBlock.Offset(),
m_layout->m_storageTotalSize, m_layout->DefaultPitch(), bIs16BitDepth ? CELL_GCM_COMPMODE_DISABLED : CELL_GCM_COMPMODE_Z32_SEPSTENCIL_REGULAR,
m_lmBlock.TiledMemoryTagAreaBase(), // The area base + size/0x10000 will be allocated as the tag area.
1 ); // Misc depth buffers on bank 1
cellGcmBindTile( uiTileIndex );
}
else if ( uAllocationType == kAllocPs3gcmColorBufferMisc )
{
uint32 uiTileIndex = m_lmBlock.TiledMemoryIndex();
cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL, m_lmBlock.Offset(),
m_layout->m_storageTotalSize, m_layout->DefaultPitch(), CELL_GCM_COMPMODE_DISABLED,
m_lmBlock.TiledMemoryTagAreaBase(), // The area base + size/0x10000 will be allocated as the tag area.
1 ); // Tile misc color buffers on bank 1
cellGcmBindTile( uiTileIndex );
}
}
#ifdef _DEBUG
memset( Data(), 0, m_layout->m_storageTotalSize ); // initialize texture data to BLACK in DEBUG
#endif
return true;
}

View File

@@ -0,0 +1,250 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Texture Layout, CPs3gcmTexture, and CPs3gcmTextureData_t
//
//==================================================================================================
#ifndef INCLUDED_GCMTEXTURE_H
#define INCLUDED_GCMTEXTURE_H
#include "ps3/ps3_platform.h"
#include "ps3gcmmemory.h"
#include "gcmstate.h"
//--------------------------------------------------------------------------------------------------
// Literals
//--------------------------------------------------------------------------------------------------
#define PS3_TEX_MAX_FORMAT_COUNT 48
#define PS3_TEX_CANONICAL_FORMAT_COUNT 19
//--------------------------------------------------------------------------------------------------
// Texture layout, texture etc..
//--------------------------------------------------------------------------------------------------
// Immutable, refcounted description of a texture's memory layout on RSX:
// the hash key (format/size/flags), the derived per-slice offset/size table
// and the GCM allocation pool the texel storage should come from. Instances
// are created and cached by CPs3gcmTextureLayout::New and shared between all
// textures with an identical key.
struct ALIGN16 CPs3gcmTextureLayout
{
#ifndef _CERT
char *m_layoutSummary; // for debug visibility
#endif
// format mapping description
struct ALIGN16 Format_t
{
#ifndef _CERT
char *m_formatSummary; // for debug visibility
#endif
enum GcmCaps_t
{
kCapSRGB = (1<<0), // GCM can sample it as SRGB
kCap4xBlocks = (1<<1), // Pitch is referring to 4 texel blocks and not single texel blocks (DXT)
};
D3DFORMAT m_d3dFormat; // what D3D knows it as; see public/bitmap/imageformat.h
uint32 m_gcmRemap; // GCM remap mask
uint16 m_gcmPitchPer4X; // GCM pitch multiplier per every 4 pixels of width
uint8 m_gcmFormat; // GCM format
uint8 m_gcmCaps; // GCM caps of this texture
}
ALIGN16_POST;
// const inputs used for hashing
// NOTE: keys are compared bytewise (memcmp) by the layout cache's less-func
struct Key_t
{
D3DFORMAT m_texFormat; // D3D texel format
uint16 m_size[3]; // dimensions of the base mip
uint8 m_texFlags; // mipped, autogen mips, render target, ... ?
uint8 m_nActualMipCount; // Actual number of mips; on console builds, we typically drop the smallest (highest index)
// mips to save space (they waste a lot of space for page-alignment reasons)
// high-bit 0x80 indicates cubemap
};
// layout flags
enum Flags_t
{
kfDynamicNoSwizzle = (1<<0), // Indicates whether this texture needs to keep a backing store for incremental updates.
// (On PS3 this will prevent texture from being swizzled to allow CPU writes at subrect offsets)
kfMip = (1<<1),
kfMipAuto = (1<<2),
kfTypeRenderable = (1<<3),
kfTypeDepthStencil = (1<<4),
kfTypeCubeMap = (1<<5),
kfSrgbEnabled = (1<<6),
kfNoD3DMemory = (1<<7), // Allocation of storage for the bits has been deferred (call IDirect3DDevice9::AllocateTextureStorage to do the allocation)
// -!!--!!- DO NOT ADD MORE FLAGS -!!--!!- (m_texFlags is only 8 bits)
};
// slice information (one entry per face/mip combination)
struct Slice_t
{
uint32 m_storageOffset; //where in the storage slab does this slice live
uint32 m_storageSize; //how much storage does this slice occupy
uint16 m_size[3]; //texel dimensions of this slice
};
//
// Structure definition
//
Key_t m_key; // key of the layout
int32 mutable m_refCount; // refcount
uint32 m_storageTotalSize; // size of storage slab required
uint16 m_nFormat; // format specific info; index in g_ps3texFormats table
uint8 m_mipCount; // derived by starting at base size and working down towards 1x1
CPs3gcmAllocationType_t mutable m_gcmAllocType; // type of GCM allocation to determine pool/alignment/etc.
#ifndef SPU
// slice array
Slice_t m_slices[0]; // dynamically allocated 2-d array [faces][mips]
public:
inline int SlicePitch( int iSlice ) const;
inline int DefaultPitch() const;
inline const Format_t * GetFormatPtr()const;
#endif
public:
// Swizzled layout requires power-of-two dimensions and is disabled for
// dynamic and renderable textures
inline bool IsSwizzled() const { return !( m_key.m_texFlags & ( kfDynamicNoSwizzle | kfTypeRenderable ) ) && IsPowerOfTwo( m_key.m_size[0] ) && IsPowerOfTwo( m_key.m_size[1] ) && IsPowerOfTwo( m_key.m_size[2] ); }
inline bool IsCubeMap() const { return !!(m_key.m_texFlags & kfTypeCubeMap); }
inline bool IsVolumeTex() const { return !!(m_key.m_size[2] > 1); }
// Tiled memory: renderable but NOT dynamic
inline bool IsTiledMemory() const { return (m_key.m_texFlags & ( kfTypeRenderable | kfDynamicNoSwizzle )) == kfTypeRenderable; }
inline int FaceCount() const { return ( !IsCubeMap() ) ? 1 : 6; }
inline int MipCount() const { return ( m_key.m_texFlags & kfMip ) ? m_key.m_nActualMipCount : 1; }
// SPU-friendly variants take explicit slice/format table pointers instead of
// touching the PPU globals
inline int SlicePitch2( int iSlice, const Slice_t* pSlices, const Format_t *pTexFormats ) const{ return !IsTiledMemory() ? ( ( IsSwizzled() ? pSlices[iSlice].m_size[0] : m_key.m_size[0] ) * pTexFormats[m_nFormat].m_gcmPitchPer4X / 4 ) : pTexFormats[m_nFormat].m_gcmPitchPer4X; }
inline int DefaultPitch2( const Format_t *pTexFormats ) const { return !IsTiledMemory() ? m_key.m_size[0] * pTexFormats[m_nFormat].m_gcmPitchPer4X / 4 : pTexFormats[m_nFormat].m_gcmPitchPer4X; }
inline int SliceIndex( int face, int mip ) const { return mip + ( face * MipCount() ); }
public:
#ifndef SPU
static CPs3gcmTextureLayout const * New( Key_t const &k );
void Release() const;
#endif
}
ALIGN16_POST;
extern CPs3gcmTextureLayout::Format_t g_ps3texFormats[PS3_TEX_MAX_FORMAT_COUNT];
extern uint g_nPs3texFormatCount;
#ifndef SPU
// convenience functions on PPU that use implicit tables always accessible on PPU
inline int CPs3gcmTextureLayout::SlicePitch( int iSlice ) const
{
	// PPU-side convenience wrapper: forwards to SlicePitch2 using the
	// globally accessible slice array and format table
	int nPitchBytes = SlicePitch2( iSlice, &m_slices[0], g_ps3texFormats );
	return nPitchBytes;
}
inline int CPs3gcmTextureLayout::DefaultPitch() const
{
	// PPU-side convenience wrapper around DefaultPitch2 with the global format table
	return DefaultPitch2( &g_ps3texFormats[0] );
}
inline const CPs3gcmTextureLayout::Format_t * CPs3gcmTextureLayout::GetFormatPtr() const
{
	// Resolve this layout's format index into the global format table
	return g_ps3texFormats + m_nFormat;
}
#endif
// A texture instance: a shared refcounted layout plus the local-memory block
// that holds (or will eventually hold, see kfNoD3DMemory) the texel data.
struct ALIGN16 CPs3gcmTexture
{
CPs3gcmTextureLayout const *m_layout; // this structure persists. see CPs3gcmTextureLayout::Release( it asserts if refcount goes down to zero )
ALIGN16 CPs3gcmLocalMemoryBlock m_lmBlock ALIGN16_POST; // this structure has the Offset, and the texture bits at that offset persist until all Draw calls are made that use it
// RSX offset of the texel storage; only valid once the block is allocated
inline uint32 Offset()const { Assert( m_lmBlock.Size() ); return m_lmBlock.Offset(); }
#ifndef SPU
// CPU-visible address of the texel storage (resolved via the block's pool)
inline char * Data() { Assert( m_lmBlock.Size() ); return m_lmBlock.DataInAnyMemory(); }
#endif
public:
#ifndef SPU
static CPs3gcmTexture * New( CPs3gcmTextureLayout::Key_t const &key );
void Release();
bool Allocate();
#endif
}
ALIGN16_POST;
// POD mirror of a CPs3gcmTexture suitable for sharing with the SPU: the
// layout pointer stored as a raw 32-bit effective address plus the local
// memory offset of the texel bits. Either both fields are set or both are 0.
struct CPs3gcmTextureData_t
{
// CPs3gcmTextureLayout const *m_eaLayout
uint32 m_eaLayout; // this structure persists. see CPs3gcmTextureLayout::Release( it asserts if refcount goes down to zero )
uint32 m_nLocalOffset; // the offset of the texture bits
// Capture layout address + storage offset from a texture (NULL resets both).
void Assign( const CPs3gcmTexture * pThat )
{
if( pThat )
{
m_eaLayout = ( uint32 )pThat->m_layout;
m_nLocalOffset = pThat->Offset();
// Layout must be 16-byte aligned and both fields non-zero together
Assert( m_eaLayout ? !( 15 & ( uintp( m_eaLayout ) | m_nLocalOffset ) ) && m_nLocalOffset : !m_nLocalOffset );
}
else
{
Reset();
}
}
inline uint32 Offset()const { return m_nLocalOffset; }
void Reset()
{
m_eaLayout = 0;
m_nLocalOffset = 0;
}
bool IsNull()const
{
return !NotNull();
}
bool NotNull()const
{
// either both are null, or none is null
Assert( ( m_eaLayout == 0 ) == ( m_nLocalOffset == 0 ) );
return m_eaLayout != 0;
}
operator bool() const { return NotNull(); }
};
//
// CPs3BindTexture_t : Everything we need to bind a texture
//
// This is what the SPU needs to bind the texture
struct CPs3BindTexture_t
{
uint8 m_sampler; // sampler slot this texture is bound to
uint8 m_nBindFlags;
uint8 m_UWrap; // U/V/W addressing modes
uint8 m_VWrap;
uint8 m_WWrap;
uint8 m_minFilter; // minification / magnification / mip filters
uint8 m_magFilter;
uint8 m_mipFilter;
uint32 m_nLayout; // NOTE(review): presumably the CPs3gcmTextureLayout address as a uint32 (cf. CPs3gcmTextureData_t::m_eaLayout) — confirm at pack site
CPs3gcmLocalMemoryBlock *m_pLmBlock; // memory block holding the texel data
int m_boundStd;
int m_hTexture;
};
// This is what we store when asked to bind a texture
// When the cmd buffer is executed, at this time we lookup
// the remaining fields and pack some CPs3BindTexture_t to actually use on the SPU
struct CPs3BindParams_t
{
uint16 m_nBindTexIndex; // NOTE(review): presumably an index used to look up the full CPs3BindTexture_t when the cmd buffer executes — confirm at use site
uint8 m_sampler; // sampler slot
uint8 m_nBindFlags;
int m_boundStd;
int m_hTexture;
};
#endif // INCLUDED_GCMTEXTURE_H

View File

@@ -0,0 +1,12 @@
//===== Copyright (c) 1996-2008, Valve Corporation, All rights reserved. ======//

// Pixel shader input: interpolated texture coordinate
struct PS_IN
{
	float2 TexCoord : TEXCOORD;
};

sampler detail : register( s0 );

// Passthrough pixel shader: output the detail texture sample unmodified
float4 main( PS_IN In ) : COLOR
{
	float4 vColor = tex2D( detail, In.TexCoord );
	return vColor;
}

View File

@@ -0,0 +1,23 @@
//===== Copyright (c) 1996-2008, Valve Corporation, All rights reserved. ======//

// Combined world-view-projection matrix, starting at constant register c0
float4x4 matWVP : register(c0);

// Vertex input: object-space position and one UV set
struct VS_IN
{
	float4 ObjPos : POSITION;
	float2 TexCoord : TEXCOORD;
};

// Vertex output: clip-space position plus the pass-through UV
struct VS_OUT
{
	float4 ProjPos : POSITION;
	float2 TexCoord : TEXCOORD;
};

// Transform the vertex to clip space and forward the UV unchanged
VS_OUT main( VS_IN In )
{
	VS_OUT vsOut;
	vsOut.ProjPos = mul( In.ObjPos, matWVP );
	vsOut.TexCoord = In.TexCoord;
	return vsOut;
}

View File

@@ -0,0 +1,13 @@
//===== Copyright (c) 1996-2008, Valve Corporation, All rights reserved. ======//

// Pixel shader input: interpolated UV
struct PS_IN
{
	float2 TexCoord : TEXCOORD;
};

sampler detail : register( s0 );

// Emit the detail texture sample with no further processing
float4 main( PS_IN In ) : COLOR
{
	float4 vSampled = tex2D( detail, In.TexCoord );
	return vSampled;
}

View File

@@ -0,0 +1,54 @@
//===== Copyright (c) 1996-2008, Valve Corporation, All rights reserved. ======//

// Pixel shader input: interpolated UV
struct PS_IN
{
	float2 TexCoord : TEXCOORD;
};

// Convert an sRGB gamma-space value in [0,1] to linear space
// (standard sRGB piecewise curve: linear toe below 0.04045, power 2.4 above)
float SrgbGammaToLinear( float flSrgbGammaValue )
{
	float flClamped = saturate( flSrgbGammaValue );
	if ( flClamped <= 0.04045f )
	{
		return flClamped / 12.92f;
	}
	return pow( ( flClamped + 0.055f ) / 1.055f, 2.4f );
}
// Convert a linear-space value in [0,1] to Xbox 360 piecewise-linear gamma
// space: four linear segments with breakpoints at 64/1023, 128/1023 and
// 512/1023 approximating the gamma curve.
// NOTE(review): currently unused in this shader — main() below samples the
// texture without any gamma conversion.
float X360LinearToGamma( float flLinearValue )
{
float fl360GammaValue;
flLinearValue = saturate( flLinearValue );
if ( flLinearValue < ( 128.0f / 1023.0f ) )
{
if ( flLinearValue < ( 64.0f / 1023.0f ) )
{
// Segment 1: [0, 64/1023)
fl360GammaValue = flLinearValue * ( 1023.0f * ( 1.0f / 255.0f ) );
}
else
{
// Segment 2: [64/1023, 128/1023)
fl360GammaValue = flLinearValue * ( ( 1023.0f / 2.0f ) * ( 1.0f / 255.0f ) ) + ( 32.0f / 255.0f );
}
}
else
{
if ( flLinearValue < ( 512.0f / 1023.0f ) )
{
// Segment 3: [128/1023, 512/1023)
fl360GammaValue = flLinearValue * ( ( 1023.0f / 4.0f ) * ( 1.0f / 255.0f ) ) + ( 64.0f / 255.0f );
}
else
{
// Segment 4: [512/1023, 1], clamped to 1
fl360GammaValue = flLinearValue * ( ( 1023.0f /8.0f ) * ( 1.0f / 255.0f ) ) + ( 128.0f /255.0f );
if ( fl360GammaValue > 1.0f )
{
fl360GammaValue = 1.0f;
}
}
}
fl360GammaValue = saturate( fl360GammaValue );
return fl360GammaValue;
}
sampler detail : register( s0 );

// Passthrough pixel shader; the gamma helpers above are not applied here
float4 main( PS_IN In ) : COLOR
{
	return tex2D( detail, In.TexCoord );
}

View File

@@ -0,0 +1,941 @@
//========== Copyright (c) 2010, Valve Corporation, All rights reserved. ========
#include "dxabstract.h"
#include "ps3gcmstate.h"
#include "utlmap.h"
#include "ps3/ps3gcmlabels.h"
#include "sys/tty.h"
#include "convar.h"
//#include "vjobs/spudrawqueue_shared.h"
#include "spugcm.h"
#include "memdbgon.h"
PLATFORM_OVERRIDE_MEM_ALLOC_INTERNAL_PS3_IMPL
//////////////////////////////////////////////////////////////////////////
#if 1 // #ifndef _CERT
#define TRACK_ALLOC_STATS 1
#endif
#ifdef GCMLOCALMEMORYBLOCKDEBUG
ConVar r_ps3_gcmnocompact( "r_ps3_gcmnocompact", "0" );
ConVar r_ps3_gcmlowcompact( "r_ps3_gcmlowcompact", "0" );
#endif
static CThreadFastMutex s_AllocMutex;
static int32 s_uiGcmLocalMemoryAllocatorMutexLockCount;
struct CGcmLocalMemoryAllocatorMutexLockCounter_t
{
	// RAII depth counter: incremented for the lifetime of each allocator
	// mutex hold; consulted by IsItSafeToRefreshFrontBufferNonInteractivePs3.
	CGcmLocalMemoryAllocatorMutexLockCounter_t()
	{
		Assert( s_uiGcmLocalMemoryAllocatorMutexLockCount >= 0 );
		s_uiGcmLocalMemoryAllocatorMutexLockCount ++;
	}
	~CGcmLocalMemoryAllocatorMutexLockCounter_t()
	{
		Assert( s_uiGcmLocalMemoryAllocatorMutexLockCount > 0 );
		s_uiGcmLocalMemoryAllocatorMutexLockCount --;
	}
};
#define PS3ALLOCMTX AUTO_LOCK( s_AllocMutex ); CGcmLocalMemoryAllocatorMutexLockCounter_t aLockCounter;
bool IsItSafeToRefreshFrontBufferNonInteractivePs3()
{
	// NOTE: only main thread can refresh front buffer
	if ( !ThreadInMainThread() )
		return false;

	// Safe only while no code path is currently inside the local memory allocator
	AUTO_LOCK( s_AllocMutex );
	Assert( s_uiGcmLocalMemoryAllocatorMutexLockCount >= 0 );
	bool bAllocatorIdle = ( s_uiGcmLocalMemoryAllocatorMutexLockCount <= 0 );
	return bAllocatorIdle;
}
// Allocator-internal view of CPs3gcmLocalMemoryBlock that exposes writable
// references to fields the public block type keeps read-only.
struct CPs3gcmLocalMemoryBlockMutable : public CPs3gcmLocalMemoryBlock
{
inline uint32 & MutableOffset() { return m_nLocalMemoryOffset; }
inline uint32 & MutableSize() { return m_uiSize; }
inline CPs3gcmAllocationType_t & MutableType() { return m_uType; }
inline uint32 & MutableIndex() { return m_uiIndex; }
};
#ifdef GCMLOCALMEMORYBLOCKDEBUG
static const uint64 g_GcmLocalMemoryBlockDebugCookieAllocated = 0xA110CA7EDA110CA7ull;
static const uint64 g_GcmLocalMemoryBlockDebugCookieFree = 0xFEEFEEFEEFEEFEEFllu;
#endif
// Allocator for one pool of RSX-visible memory; one instance exists per
// allocation pool (see the g_ps3gcmLocalMemoryAllocator array below).
// Frees are deferred via a GPU fence label so memory is only reused after
// RSX is done with it.
struct CPs3gcmLocalMemoryAllocator
{
//////////////////////////////////////////////////////////////////////////
//
// Allocated memory tracking
//
uint32 m_nOffsetMin; // RSX Local Memory allocated by Initialization that will never be released
uint32 m_nOffsetMax; // Ceiling of allocatable RSX Local Memory (because the top portion is reserved for zcull/etc.), top portion managed separately
uint32 m_nOffsetUnallocated; // RSX Local Memory offset of not yet allocated memory (between Min and Max)
CUtlVector< CPs3gcmLocalMemoryBlockMutable * > m_arrAllocations; // Sorted array of all allocations
//////////////////////////////////////////////////////////////////////////
//
// Free blocks tracking
//
struct LocalMemoryAllocation_t
{
CPs3gcmLocalMemoryBlockMutable m_block;
uint32 m_uiFenceNumber; // fence value recorded when the block was freed
LocalMemoryAllocation_t *m_pNext; // singly-linked list link
};
LocalMemoryAllocation_t *m_pPendingFreeBlock; // freed by CPU, possibly still referenced by RSX
LocalMemoryAllocation_t *m_pFreeBlock; // NOTE(review): presumably fence-completed blocks ready for reuse (see Reclaim/FindFreeBlock) — confirm in their bodies
static uint32 sm_uiFenceNumber; // monotonically issued fence counter (shared by all pools)
uint32 m_uiFenceLastKnown; // last fence value this pool observed
static uint32 volatile *sm_puiFenceLocation; // RSX-written label (GCM_LABEL_MEMORY_FREE, see Ps3gcmLocalMemoryAllocator_Init)
//////////////////////////////////////////////////////////////////////////
//
// Implementation
//
inline bool Alloc( CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock );
inline void Free( CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock );
inline uint32 Reclaim( bool bForce = false );
inline void Compact();
// Helper methods
inline LocalMemoryAllocation_t * FindFreeBlock( uint32 uiAlignBytes, uint32 uiSize );
inline bool IsFenceCompleted( uint32 uiCurrentFenceValue, uint32 uiCheckStoredFenceValue );
inline void TrackAllocStats( CPs3gcmAllocationType_t uAllocType, int nDelta );
#ifdef GCMLOCALMEMORYBLOCKDEBUG
inline void ValidateAllBlocks();
#endif
}
g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolCount];
uint32 CPs3gcmLocalMemoryAllocator::sm_uiFenceNumber;
uint32 volatile * CPs3gcmLocalMemoryAllocator::sm_puiFenceLocation;
// RSX memory usage stats tracking:
static GPUMemoryStats g_RsxMemoryStats;
// Per-pool byte counters backing the GPU memory stats report
// (presumably maintained by TrackAllocStats — its body is not visible here)
struct GPUMemoryStats_Pool
{
int nDefaultPoolSize;
int nDefaultPoolUsed;
int nRTPoolUsed;
int nDynamicPoolUsed;
int nMainMemUsed;
int nUnknownPoolUsed;
};
GPUMemoryStats_Pool g_RsxMemoryStats_Pool;
static inline uint32 Ps3gcmHelper_ComputeTiledAreaMemorySize( uint32 nCount, uint32 w, uint32 h, uint32 bpp )
{
	// One tiled surface = hardware tiled pitch for a w*bpp row, with the
	// height padded to 32 lines; multiply by the surface count.
	uint32 nPitch = cellGcmGetTiledPitchSize( w * bpp );
	uint32 uiTotal = nPitch * AlignValue( h, 32 ) * nCount;
	// Round the whole region up to the misc color buffer allocation alignment
	return AlignValue( uiTotal, PS3GCMALLOCATIONALIGN( kAllocPs3gcmColorBufferMisc ) );
}
void Ps3gcmLocalMemoryAllocator_Init()
{
PS3ALLOCMTX
if ( !CPs3gcmLocalMemoryAllocator::sm_puiFenceLocation )
{
CPs3gcmLocalMemoryAllocator::sm_puiFenceLocation = cellGcmGetLabelAddress( GCM_LABEL_MEMORY_FREE );
*CPs3gcmLocalMemoryAllocator::sm_puiFenceLocation = 0;
}
// Pool boundaries
uint32 uiGcmAllocBegin = g_ps3gcmGlobalState.m_nLocalBaseOffset;
uint32 uiGcmAllocEnd = uiGcmAllocBegin + g_ps3gcmGlobalState.m_nLocalSize;
// Memory should be allocated for large frame buffers
uint32 uiMemorySizeBuffer[2] = { MAX( 1280, g_ps3gcmGlobalState.m_nRenderSize[0] ), MAX( 720, g_ps3gcmGlobalState.m_nRenderSize[1] ) };
uint32 uiFactor[2] = { uiMemorySizeBuffer[0]*uiMemorySizeBuffer[1], 1280*720 };
// Configuration of pool memory (can be #ifdef'd for every game)
static const uint32 s_PoolMemoryLayout[/*kGcmAllocPoolCount*/] =
{
#if defined( CSTRIKE15 )
// mhansen - We had to adjust the memory values a bit for cstrike15 to get a map to load
// PS3_BUILDFIX - We need to revisit this to determine the proper size later on
// mdonofrio - render target allocations revisited for PS3
// potential to save some more (~12Mb) from TiledColourFB (only need two really.
// wait for other rendering optimisation/rework to be finished first before attempting.
/*kGcmAllocPoolDefault = */ 0,
/*kGcmAllocPoolDynamicNewPath = */ 5 * 1024 * 1024, // 5 MB
/*kGcmAllocPoolDynamic = */ 11 * 1024 * 1024, // 11 MB
/*kGcmAllocPoolTiledColorFB = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2 + CPs3gcmDisplay::SURFACE_COUNT, uiMemorySizeBuffer[0], uiMemorySizeBuffer[1], 4 ), // 3 buffers allocated in CreateRSXBuffers + 2 _rt_fullFrameFB - can probably get this down to 2 if we 1. don't use MLAA and 2. we clean up the post-pro rendering to use the front buffer as a textureand 3. tidy up aliasing for rt_fullframeFB and rt_fullFrameFB1
/*kGcmAllocPoolTiledColorFBQ = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2, uiMemorySizeBuffer[0]/4, uiMemorySizeBuffer[1]/4, 4 ), // fits 2 1/4 size framebuffer textures
/*kGcmAllocPoolTiledColor512 = */ 0,
/*kGcmAllocPoolTiledColorMisc = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 1, 640, 640, 4 ) + Ps3gcmHelper_ComputeTiledAreaMemorySize( 2, 1024, 512, 4) + Ps3gcmHelper_ComputeTiledAreaMemorySize(1, 32, 32, 4), // // 1x 1/2 size smoke/fog buffer, 2xWater(1024x512x32bpp), EyeGlint(32x32x32bpp), *Monitor(256x256x32bpp), *RTTFlashlightShadows(864x864x8bpp) - * we don't need these for CS15
/*kGcmAllocPoolTiledD24S8 = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2, uiMemorySizeBuffer[0], uiMemorySizeBuffer[1], 4 ), // only 2 depth buffer targets required (current and saved off), + reserve space for 1/2 size depth buffer for smoke/fog
/*kGcmAllocPoolMainMemory = */ 0, // configured based on mapped IO memory
/*kGcmAllocPoolMallocMemory = */ 0, // using malloc
#else
/*kGcmAllocPoolDefault = */ 0,
/*kGcmAllocPoolDynamicNewPath = */ 5 * 1024 * 1024, // 5 MB
/*kGcmAllocPoolDynamic = */ 10 * 1024 * 1024, // 10 MB
/*kGcmAllocPoolTiledColorFB = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2 * CPs3gcmDisplay::SURFACE_COUNT, uiMemorySizeBuffer[0], uiMemorySizeBuffer[1], 4 ), // fits 6 of full framebuffer textures
/*kGcmAllocPoolTiledColorFBQ = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 4, uiMemorySizeBuffer[0]/4, uiMemorySizeBuffer[1]/4, 4 ), // fits 4 quarters of framebuffer textures
/*kGcmAllocPoolTiledColor512 = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2, 512, 512, 4 ), // fits 2 512x512 RGBA textures
/*kGcmAllocPoolTiledColorMisc = */ 5 * 1024 * 1024, // 5 MB
/*kGcmAllocPoolTiledD24S8 = */ uint64( 15 * 1024 * 1024 ) * uiFactor[0]/uiFactor[1], // 15 MB
/*kGcmAllocPoolMainMemory = */ 0, // configured based on mapped IO memory
/*kGcmAllocPoolMallocMemory = */ 0, // using malloc
#endif
};
COMPILE_TIME_ASSERT( ARRAYSIZE( s_PoolMemoryLayout ) == ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ) );
for ( int j = ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ); j -- > 0; )
{
const uint32 uiSize = AlignValue( s_PoolMemoryLayout[j], 1024 * 1024 ); // Align it on 1 MB boundaries, all our pools are large
g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMax = uiGcmAllocEnd;
uiGcmAllocEnd -= uiSize;
g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMin =
g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetUnallocated = uiGcmAllocEnd;
}
// Default pool setup (rest of local memory)
g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetMax = uiGcmAllocEnd;
g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetMin =
g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetUnallocated = uiGcmAllocBegin;
// Main memory mapped pool
g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ].m_nOffsetMin =
g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ].m_nOffsetUnallocated = uint32( g_ps3gcmGlobalState.m_pRsxMainMemoryPoolBuffer ) + g_ps3gcmGlobalState.m_nIoOffsetDelta;
g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ].m_nOffsetMax = g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ].m_nOffsetMin + g_ps3gcmGlobalState.m_nRsxMainMemoryPoolBufferSize;
// Store initial capacity for memory stats tracking:
g_RsxMemoryStats.nGPUMemSize = g_ps3gcmGlobalState.m_nLocalSize;
g_RsxMemoryStats_Pool.nDefaultPoolSize = g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetMin;
//
// Setup preset tiled regions
//
{
CPs3gcmAllocationPool_t ePool = kGcmAllocPoolTiledColorFB;
uint8 uiBank = 0; // bank 0..3
uint32 nRenderPitch = cellGcmGetTiledPitchSize( g_ps3gcmGlobalState.m_nRenderSize[0] * 4 );
uint8 uiTileIndex = ePool - kGcmAllocPoolTiledColorFB;
cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL,
g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
nRenderPitch, CELL_GCM_COMPMODE_DISABLED,
( g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolTiledColorFB ].m_nOffsetMin ) / 0x10000, // The area base + size/0x10000 will be allocated as the tag area.
uiBank );
cellGcmBindTile( uiTileIndex );
}
{
CPs3gcmAllocationPool_t ePool = kGcmAllocPoolTiledColorFBQ;
uint8 uiBank = 1; // bank 0..3
uint32 nRenderPitch = cellGcmGetTiledPitchSize( g_ps3gcmGlobalState.m_nRenderSize[0] * 4 / 4 );
uint8 uiTileIndex = ePool - kGcmAllocPoolTiledColorFB;
cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL,
g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
nRenderPitch, CELL_GCM_COMPMODE_DISABLED,
( g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolTiledColorFB ].m_nOffsetMin ) / 0x10000, // The area base + size/0x10000 will be allocated as the tag area.
uiBank );
cellGcmBindTile( uiTileIndex );
}
{
CPs3gcmAllocationPool_t ePool = kGcmAllocPoolTiledColor512;
uint8 uiBank = 2; // bank 0..3
uint32 nRenderPitch = cellGcmGetTiledPitchSize( 512 * 4 );
uint8 uiTileIndex = ePool - kGcmAllocPoolTiledColorFB;
cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL,
g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
nRenderPitch, CELL_GCM_COMPMODE_DISABLED,
( g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolTiledColorFB ].m_nOffsetMin ) / 0x10000, // The area base + size/0x10000 will be allocated as the tag area.
uiBank );
cellGcmBindTile( uiTileIndex );
}
#ifndef _CERT
static const char * s_PoolMemoryNames[] =
{
/*kGcmAllocPoolDefault = */ "Default Pool",
/*kGcmAllocPoolDynamicNewPath = */ "Dynamic New ",
/*kGcmAllocPoolDynamic = */ "Dynamic IBVB",
/*kGcmAllocPoolTiledColorFB = */ "FullFrameRTs",
/*kGcmAllocPoolTiledColorFBQ = */ "1/4Frame RTs",
/*kGcmAllocPoolTiledColor512 = */ "512x512 RTs ",
/*kGcmAllocPoolTiledColorMisc = */ "All Misc RTs",
/*kGcmAllocPoolTiledD24S8 = */ "DepthStencil",
/*kGcmAllocPoolMainMemory = */ "Main Memory ",
/*kGcmAllocPoolMallocMemory = */ "MallocMemory",
};
COMPILE_TIME_ASSERT( ARRAYSIZE( s_PoolMemoryNames ) == ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ) );
Msg( "RSX Local Memory layout:\n" );
for ( int j = 0; j < ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ); ++ j )
{
Msg( " %s 0x%08X - 0x%08X [ %9.3f MB ]\n",
s_PoolMemoryNames[j],
g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMin,
g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMax,
(g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMin) / 1024.f / 1024.f );
}
Msg( "Total size: %d MB\n", g_ps3gcmGlobalState.m_nLocalSize / 1024 / 1024 );
#endif
}
void Ps3gcmLocalMemoryAllocator_Reclaim()
{
PS3ALLOCMTX
for ( int k = 0; k < ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ); ++ k )
g_ps3gcmLocalMemoryAllocator[ k ].Reclaim();
}
// Compacts every local-memory pool. Requires full RSX/PPU synchronization:
// the GPU is drained before blocks are moved, and drained again afterwards so
// that the RSX-side data transfers queued by Compact() have completed before
// callers resume rendering.
void Ps3gcmLocalMemoryAllocator_Compact()
{
// Set to 1 to emit timing/size info for the compaction to the TTY
#define PS3GCMCOMPACTPROFILE 0
#if PS3GCMCOMPACTPROFILE
float flTimeStart = Plat_FloatTime();
uint32 uiFree = g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated;
#endif
// Let RSX wait for final flip
GCM_FUNC( cellGcmSetWaitFlip );
// Let PPU wait for all RSX commands done (include waitFlip)
g_ps3gcmGlobalState.CmdBufferFinish();
#if PS3GCMCOMPACTPROFILE
float flTimeWait = Plat_FloatTime() - flTimeStart;
#endif
{
// Compact all pools while holding the allocator mutex; each Compact()
// queues RSX data transfers that relocate the surviving blocks.
PS3ALLOCMTX
for ( int k = 0; k < ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ); ++ k )
{
g_ps3gcmLocalMemoryAllocator[ k ].Compact();
}
}
#if PS3GCMCOMPACTPROFILE
float flTimePrepareTransfer = Plat_FloatTime() - flTimeStart;
#endif
// Wait for all RSX memory to be transferred
g_ps3gcmGlobalState.CmdBufferFinish();
#if PS3GCMCOMPACTPROFILE
float flTimeDone = Plat_FloatTime() - flTimeStart;
char chBuffer[64];
Q_snprintf( chBuffer, ARRAYSIZE( chBuffer ), "COMPACT: %0.3f / %0.3f / %0.3f sec\n",
flTimeWait, flTimePrepareTransfer, flTimeDone );
uint32 dummy;
sys_tty_write( SYS_TTYP6, chBuffer, Q_strlen( chBuffer ), &dummy );
Q_snprintf( chBuffer, ARRAYSIZE( chBuffer ), "COMPACT: %0.3f -> %0.3f MB (%0.3f MB free)\n",
uiFree / 1024.f / 1024.f, g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated / 1024.f / 1024.f,
(g_ps3gcmLocalMemoryAllocator[0].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated) / 1024.f / 1024.f );
sys_tty_write( SYS_TTYP6, chBuffer, Q_strlen( chBuffer ), &dummy );
#endif
}
void Ps3gcmLocalMemoryAllocator_CompactWithReason( char const *szReason )
{
double flTimeCompactStart = Plat_FloatTime();
DevMsg( "====== GCM LOCAL MEMORY COMPACT : %s =====\n", szReason );
uint32 uiFreeMemoryBeforeCompact = g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated;
DevMsg( "RSX Local Memory Free: %0.3f MB; compacting...\n", (g_ps3gcmLocalMemoryAllocator[0].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated) / 1024.f / 1024.f );
Ps3gcmLocalMemoryAllocator_Compact();
DevMsg( "RSX Local Memory Compacted %0.3f MB in %0.3f sec\n",
(uiFreeMemoryBeforeCompact - g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated) / 1024.f / 1024.f,
Plat_FloatTime() - flTimeCompactStart );
DevMsg( "RSX Local Memory Free: %0.3f MB\n", (g_ps3gcmLocalMemoryAllocator[0].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated) / 1024.f / 1024.f );
}
// Allocates this block from the pool encoded in its allocation type.
// Returns false only if the owning pool reports exhaustion (main-memory pool);
// see CPs3gcmLocalMemoryAllocator::Alloc for the full contract.
bool CPs3gcmLocalMemoryBlock::Alloc()
{
PS3ALLOCMTX
// Cast grants the allocator write access to this block's private fields
return g_ps3gcmLocalMemoryAllocator[PS3GCMALLOCATIONPOOL(m_uType)].Alloc( reinterpret_cast< CPs3gcmLocalMemoryBlockMutable * >( this ) );
}
// Returns this block to its owning pool. The memory is not reusable
// immediately: it is fenced and reclaimed once RSX has passed the free label.
void CPs3gcmLocalMemoryBlock::Free()
{
PS3ALLOCMTX
// Cast grants the allocator write access to this block's private fields
g_ps3gcmLocalMemoryAllocator[PS3GCMALLOCATIONPOOL(m_uType)].Free( reinterpret_cast< CPs3gcmLocalMemoryBlockMutable * >( this ) );
}
//////////////////////////////////////////////////////////////////////////
//
// Private implementation of PS3 local memory allocator
//
// Allocates local-memory (or malloc-pool) space for pBlock inside this pool.
// Strategy, in order:
//   1) reuse a reclaimed free block of suitable size/alignment (FindFreeBlock),
//   2) carve new space from the unallocated tail of the pool,
//   3) if the pool is full: stall on RSX fences to reclaim pending frees,
//      then (once) compact the pool, retrying the allocation each time.
// The malloc-backed pool bypasses all of the above and uses aligned malloc.
// Returns false only for the main-memory pool when exhausted (so the caller
// can fall back to a local-memory pool); any other exhaustion is fatal.
// Note: fix in this revision — removed the unused local 'uiOldUnallocatedEdge'
// (dead store, served no purpose).
inline bool CPs3gcmLocalMemoryAllocator::Alloc( CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock )
{
TrackAllocStats( pBlock->MutableType(), pBlock->MutableSize() );
uint32 uAlignBytes = PS3GCMALLOCATIONALIGN( pBlock->MutableType() );
Assert( IsPowerOfTwo( uAlignBytes ) );
double flAllocatorStallTime = 0.0f;
// When true, the "compact on low memory" path has already run (or is disabled)
bool bCompactPerformed = true;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
bCompactPerformed = !r_ps3_gcmlowcompact.GetBool();
#endif
retry_allocation:
// Try to find a free block
if ( LocalMemoryAllocation_t *pFreeBlock = FindFreeBlock( uAlignBytes, pBlock->MutableSize() ) )
{
// Reuse the recycled block's offset and allocation-table slot
pBlock->MutableOffset() = pFreeBlock->m_block.MutableOffset();
pBlock->MutableIndex() = pFreeBlock->m_block.MutableIndex();
#ifdef GCMLOCALMEMORYBLOCKDEBUG
if ( m_arrAllocations[ pBlock->MutableIndex() ] != &pFreeBlock->m_block )
Error( "<vitaliy> GCM Local Memory Allocator Error (attempt to reuse invalid free block)!" );
#endif
m_arrAllocations[ pBlock->MutableIndex() ] = reinterpret_cast< CPs3gcmLocalMemoryBlockMutable * >( pBlock );
delete pFreeBlock;
}
else if ( this != &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMallocMemory ] )
{
// Allocate new block from the unallocated tail, aligned up as required
uint32 uiFreeBlock = ( m_nOffsetUnallocated + uAlignBytes - 1 ) & ~( uAlignBytes - 1 );
// Check if there's enough space in this pool for the requested block
if ( uiFreeBlock + pBlock->MutableSize() > m_nOffsetMax )
{
// There's not enough space in this pool
if ( m_pPendingFreeBlock )
{
// There are pending free blocks, we just need to wait for
// RSX to finish rendering using them
if ( !flAllocatorStallTime )
{
// First stall: record start time and kick RSX so fences advance
flAllocatorStallTime = Plat_FloatTime();
g_ps3gcmGlobalState.CmdBufferFlush( CPs3gcmGlobalState::kFlushForcefully );
}
while ( Reclaim() < pBlock->MutableSize() && m_pPendingFreeBlock )
{
ThreadSleep( 1 );
}
goto retry_allocation;
}
else if ( !bCompactPerformed )
{
// Last-ditch: defragment the pool once, then retry.
// Let PPU wait for all RSX commands done
g_ps3gcmGlobalState.CmdBufferFinish();
uint32 uiFragmentedFreeSpace = m_nOffsetMax - m_nOffsetUnallocated;
for ( LocalMemoryAllocation_t *pFreeFragment = m_pFreeBlock; pFreeFragment; pFreeFragment = pFreeFragment->m_pNext )
uiFragmentedFreeSpace += pFreeFragment->m_block.MutableSize();
Warning(
"**************** GCM LOCAL MEMORY LOW *****************\n"
"<vitaliy> GCM Local Memory Allocator#%d pool compacting!\n"
" Requested allocation %u bytes.\n"
" Pool capacity %u bytes.\n"
" Free fragmented space %u bytes.\n"
" Unallocated %u bytes.\n"
" Used %u bytes.\n",
this - g_ps3gcmLocalMemoryAllocator,
( uint32 ) pBlock->MutableSize(),
m_nOffsetMax - m_nOffsetMin,
uiFragmentedFreeSpace,
m_nOffsetMax - m_nOffsetUnallocated,
m_nOffsetUnallocated - m_nOffsetMin
);
Compact();
Warning( " ---> Compacted pool#%d has %u unallocated bytes.\n",
this - g_ps3gcmLocalMemoryAllocator,
m_nOffsetMax - m_nOffsetUnallocated );
bCompactPerformed = true;
// Wait for all RSX memory to be transferred
g_ps3gcmGlobalState.CmdBufferFinish();
goto retry_allocation;
}
else
{
// Main memory pool returns failure so caller can try local pool.
if (this == &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ]) return false;
uint32 uiFragmentedFreeSpace = m_nOffsetMax - m_nOffsetUnallocated;
for ( LocalMemoryAllocation_t *pFreeFragment = m_pFreeBlock; pFreeFragment; pFreeFragment = pFreeFragment->m_pNext )
uiFragmentedFreeSpace += pFreeFragment->m_block.MutableSize();
Error(
"********* OUT OF GCM LOCAL MEMORY ********************\n"
"<vitaliy> GCM Local Memory Allocator#%d pool exhausted!\n"
" Failed allocation %u bytes.\n"
" Pool capacity %u bytes.\n"
" Free fragmented space %u bytes.\n"
" Unallocated %u bytes.\n"
" Used %u bytes.\n",
this - g_ps3gcmLocalMemoryAllocator,
( uint32 ) pBlock->MutableSize(),
m_nOffsetMax - m_nOffsetMin,
uiFragmentedFreeSpace,
m_nOffsetMax - m_nOffsetUnallocated,
m_nOffsetUnallocated - m_nOffsetMin
);
}
}
// update the pointer to "unallocated" realm
m_nOffsetUnallocated = uiFreeBlock + pBlock->MutableSize();
// this is the last allocation so far
pBlock->MutableIndex() = m_arrAllocations.AddToTail( reinterpret_cast< CPs3gcmLocalMemoryBlockMutable * >( pBlock ) );
pBlock->MutableOffset() = uiFreeBlock;
}
else
{
// Malloc-backed pool: the "offset" is actually a main-memory pointer,
// and the block is not tracked in m_arrAllocations (index == ~0).
MEM_ALLOC_CREDIT_( "GCM Malloc Pool" );
void *pvMallocMemory = MemAlloc_AllocAligned( pBlock->MutableSize(), uAlignBytes );
pBlock->MutableOffset() = (uint32) pvMallocMemory;
pBlock->MutableIndex() = ~0;
}
// Account any time spent stalled waiting for RSX fences
if ( flAllocatorStallTime )
g_ps3gcmGlobalState.m_flAllocatorStallTimeWaitingRSX += Plat_FloatTime() - flAllocatorStallTime;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
// PS3 doesn't allow more than 8 zcull regions (index 0..7)
if ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledD24S8].m_arrAllocations.Count() > 8 )
Error( "PS3 number of zcull regions exceeded!\n" );
// PS3 doesn't allow more than 15 tiles regions (index 0..14)
if ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledD24S8].m_arrAllocations.Count() +
g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorMisc].m_arrAllocations.Count() +
( kGcmAllocPoolTiledColorMisc - kGcmAllocPoolTiledColorFB )
> 15 )
Error( "PS3 number of tiled regions exceeded!\n" );
pBlock->m_dbgGuardCookie = g_GcmLocalMemoryBlockDebugCookieAllocated;
#endif
return true;
}
// Queues pBlock for deferred release. The block's state is copied into a
// LocalMemoryAllocation_t stamped with a new fence number, and an RSX
// write-back label is emitted; Reclaim() makes the memory reusable only
// after RSX reports that fence, guaranteeing the GPU is done with the data.
inline void CPs3gcmLocalMemoryAllocator::Free( CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock )
{
#ifdef GCMLOCALMEMORYBLOCKDEBUG
// Guard against double-free / freeing a block this pool never allocated
if ( !pBlock ||
pBlock->m_dbgGuardCookie != g_GcmLocalMemoryBlockDebugCookieAllocated ||
( ( pBlock->MutableIndex() != ~0 ) && ( m_arrAllocations[ pBlock->MutableIndex() ] != pBlock ) ) )
{
//DebuggerBreak();
Error( "<vitaliy> Attempt to free not allocated GCM local memory block!" );
}
pBlock->m_dbgGuardCookie = g_GcmLocalMemoryBlockDebugCookieFree;
#endif
// Record the deallocation at the head of the pending-free list; the RSX
// label below must be written with the same fence number just assigned.
LocalMemoryAllocation_t *pDealloc = new LocalMemoryAllocation_t;
pDealloc->m_block = *pBlock;
pDealloc->m_uiFenceNumber = ++ sm_uiFenceNumber;
pDealloc->m_pNext = m_pPendingFreeBlock;
GCM_FUNC( cellGcmSetWriteBackEndLabel, GCM_LABEL_MEMORY_FREE, sm_uiFenceNumber );
m_pPendingFreeBlock = pDealloc;
TrackAllocStats( pBlock->MutableType(), - pBlock->MutableSize() );
// Malloc-pool blocks (index == ~0) have no allocation-table slot to patch
if ( pBlock->MutableIndex() != ~0 )
{
#ifdef GCMLOCALMEMORYBLOCKDEBUG
if ( m_arrAllocations[ pBlock->MutableIndex() ] != pBlock )
Error( "<vitaliy> GCM Local Memory Allocator Error (freeing block that is not properly registered)!" );
#endif
// The pending-free copy now represents this slot until it is reclaimed
m_arrAllocations[ pBlock->MutableIndex() ] = &pDealloc->m_block;
}
#ifdef GCMLOCALMEMORYBLOCKDEBUG
// Poison the caller's block so stale use is detectable
pBlock->MutableOffset() = ~0;
pBlock->MutableIndex() = ~0;
#endif
}
// Returns true if RSX has passed the stored fence value. Both values are
// rebased against m_uiFenceLastKnown via unsigned subtraction, which keeps
// the comparison correct when the 32-bit fence counter wraps around.
inline bool CPs3gcmLocalMemoryAllocator::IsFenceCompleted( uint32 uiCurrentFenceValue, uint32 uiCheckStoredFenceValue )
{
#if GCM_ALLOW_NULL_FLIPS
// With null flips the GPU never consumes anything, so every fence is "done"
extern bool g_ps3_nullflips;
if ( g_ps3_nullflips )
return true;
#endif
// Needs to handle the counter wrapping around
return ( ( uiCurrentFenceValue - m_uiFenceLastKnown ) >= ( uiCheckStoredFenceValue - m_uiFenceLastKnown ) );
}
// Moves pending-free blocks whose RSX fence has completed onto the reusable
// free list (or, for the malloc pool, frees their memory outright), and
// recomputes each reclaimed block's usable size from its neighbor's offset.
// bForce = true reclaims everything regardless of fence state (only safe when
// RSX is known to be idle). Returns the largest block size reclaimed.
inline uint32 CPs3gcmLocalMemoryAllocator::Reclaim( bool bForce )
{
uint32 uiLargestBlockSizeReclaimed = 0;
uint32 uiCurrentFenceValue = *sm_puiFenceLocation;
// Walk pending free blocks and see if they are no longer
// in use by RSX:
LocalMemoryAllocation_t **p = &m_pPendingFreeBlock;
if ( !bForce ) while ( (*p) && !IsFenceCompleted( uiCurrentFenceValue, (*p)->m_uiFenceNumber ) )
p = &( (*p)->m_pNext );
// Now p is pointing to the chain of free blocks
// chain that has been completed (due to the nature of
// pushing new deallocation at the head of the pending
// list)
if ( *p )
{
// Detach the completed suffix of the pending list
LocalMemoryAllocation_t *pCompletedChain = *p;
*p = NULL; // Terminate the chain
// Handle the special case of malloc reclaim - free all memory
if ( this == &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMallocMemory ] )
{
MEM_ALLOC_CREDIT_( "GCM Malloc Pool" );
for ( LocalMemoryAllocation_t *pActualFree = pCompletedChain; pActualFree; )
{
MemAlloc_FreeAligned( pActualFree->m_block.DataInMallocMemory() );
LocalMemoryAllocation_t *pDelete = pActualFree;
pActualFree = pActualFree->m_pNext;
delete pDelete;
}
pCompletedChain = NULL;
}
// Relink the completed pending chain into
// the free blocks chain
LocalMemoryAllocation_t **ppFree = &m_pFreeBlock;
while ( *ppFree )
ppFree = &( (*ppFree)->m_pNext );
*ppFree = pCompletedChain;
// Recompute actual free sizes of the completed chain
// Actual free size is the delta between block offset and next block offset
// When there's no next block then its delta between block offset and unallocated edge
for ( LocalMemoryAllocation_t *pActualFree = pCompletedChain; pActualFree; pActualFree = pActualFree->m_pNext )
{
uint32 uiIdx = pActualFree->m_block.MutableIndex() + 1;
uint32 uiNextOffset = m_nOffsetUnallocated;
if ( uiIdx < m_arrAllocations.Count() )
{
CPs3gcmLocalMemoryBlockMutable * RESTRICT pNextBlock = m_arrAllocations[ uiIdx ];
uiNextOffset = pNextBlock->Offset();
}
uint32 uiActualBlockSize = uiNextOffset - pActualFree->m_block.Offset();
pActualFree->m_block.MutableSize() = uiActualBlockSize;
uiLargestBlockSizeReclaimed = MAX( uiLargestBlockSizeReclaimed, uiActualBlockSize );
}
}
// Remember the last known fence value
m_uiFenceLastKnown = uiCurrentFenceValue;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
ValidateAllBlocks();
#endif
return uiLargestBlockSizeReclaimed;
}
// Best-fit search of the free list: picks the smallest block whose size is in
// [uiSize, uiSize*11/10] and whose offset already satisfies uiAlignBytes
// (power of two). On success the node is unlinked from the free list and
// returned (caller owns/deletes it); returns NULL when no block qualifies.
inline CPs3gcmLocalMemoryAllocator::LocalMemoryAllocation_t * CPs3gcmLocalMemoryAllocator::FindFreeBlock( uint32 uiAlignBytes, uint32 uiSize )
{
LocalMemoryAllocation_t *ppBest = NULL;
uint32 uiSizeMax = uiSize * 11/10; // we don't want to inflate requested size by > 10%
for ( LocalMemoryAllocation_t **p = &m_pFreeBlock;
(*p);
p = &( (*p)->m_pNext ) )
{
if ( (*p)->m_block.MutableSize() >= uiSize && (*p)->m_block.MutableSize() <= uiSizeMax &&
!( (*p)->m_block.Offset() & ( uiAlignBytes - 1 ) ) )
{
// '<=' means later equally-sized candidates win; keeps best (smallest) fit
if ( !ppBest || ( (*p)->m_block.MutableSize() <= (*ppBest)->m_block.MutableSize() ) )
{
ppBest = p;
}
}
}
if ( ppBest )
{
// Unlink the winner from the free list before handing it back
LocalMemoryAllocation_t *pFree = (*ppBest);
(*ppBest) = pFree->m_pNext;
pFree->m_pNext = NULL;
return pFree;
}
return NULL;
}
// Folds a signed size delta into the per-pool usage counter for the pool this
// allocation type maps to, refreshes the "GPU memory free" figure, and
// returns true when the pool lives in RSX local memory (false = main memory).
inline bool TrackAllocStats_Pool( CPs3gcmAllocationType_t uAllocType, int nDelta )
{
	CPs3gcmAllocationPool_t nPool = PS3GCMALLOCATIONPOOL( uAllocType );
	bool bResidesInRsxMemory = true;
	int *pnUsed;
	if ( nPool == kGcmAllocPoolDefault )
	{
		pnUsed = &g_RsxMemoryStats_Pool.nDefaultPoolUsed;
	}
	else if ( nPool == kGcmAllocPoolDynamicNewPath || nPool == kGcmAllocPoolDynamic )
	{
		pnUsed = &g_RsxMemoryStats_Pool.nDynamicPoolUsed;
	}
	else if ( nPool == kGcmAllocPoolTiledColorFB || nPool == kGcmAllocPoolTiledColorFBQ ||
			  nPool == kGcmAllocPoolTiledColor512 || nPool == kGcmAllocPoolTiledColorMisc ||
			  nPool == kGcmAllocPoolTiledD24S8 )
	{
		pnUsed = &g_RsxMemoryStats_Pool.nRTPoolUsed;
	}
	else if ( nPool == kGcmAllocPoolMainMemory || nPool == kGcmAllocPoolMallocMemory )
	{
		// kGcmAllocPoolMainMemory is unused unless PS3GCM_VBIB_IN_IO_MEMORY is 1
		pnUsed = &g_RsxMemoryStats_Pool.nMainMemUsed;
		bResidesInRsxMemory = false; // In main memory!
	}
	else
	{
		pnUsed = &g_RsxMemoryStats_Pool.nUnknownPoolUsed;
	}
	*pnUsed += nDelta;
	Assert( 0 <= (int)*pnUsed );
	// Report free memory only from the default pool (the other pools are pre-sized to fixed limits, and all
	// geom/textures go into the default pool, so that's where content-driven variation/failures will occur)
	g_RsxMemoryStats.nGPUMemFree = g_RsxMemoryStats_Pool.nDefaultPoolSize - g_RsxMemoryStats_Pool.nDefaultPoolUsed;
	return bResidesInRsxMemory;
}
// Folds a signed size delta into the per-category RSX memory statistics
// (render targets / textures / VB / IB / unknown). Compiled out unless
// TRACK_ALLOC_STATS is set; pool-level stats are updated first, and types
// living in main memory are excluded from the RSX category counters.
inline void CPs3gcmLocalMemoryAllocator::TrackAllocStats( CPs3gcmAllocationType_t uAllocType, int nDelta )
{
#if TRACK_ALLOC_STATS
	// Early-out for allocations not in RSX memory:
	if ( !TrackAllocStats_Pool( uAllocType, nDelta ) )
		return;
	unsigned int *pnCategory;
	if ( uAllocType == kAllocPs3gcmColorBufferMisc || uAllocType == kAllocPs3gcmColorBufferFB ||
		 uAllocType == kAllocPs3gcmColorBufferFBQ || uAllocType == kAllocPs3gcmColorBuffer512 ||
		 uAllocType == kAllocPs3gcmDepthBuffer )
	{
		pnCategory = &g_RsxMemoryStats.nRTSize;
	}
	else if ( uAllocType == kAllocPs3gcmTextureData || uAllocType == kAllocPs3gcmTextureData0 )
	{
		pnCategory = &g_RsxMemoryStats.nTextureSize;
	}
	else if ( uAllocType == kAllocPs3GcmVertexBuffer )
	{
		pnCategory = &g_RsxMemoryStats.nVBSize;
	}
	else if ( uAllocType == kAllocPs3GcmIndexBuffer )
	{
		pnCategory = &g_RsxMemoryStats.nIBSize;
	}
	else
	{
		// Shaders, EDGE geom buffers, dynamic/DMA VB+IB pools and anything
		// else are deliberately lumped into the misc/unknown bucket unless
		// they become big or variable enough to warrant their own counter.
		pnCategory = &g_RsxMemoryStats.nUnknown;
	}
	*pnCategory += nDelta;
	Assert( 0 <= (int)*pnCategory );
#endif // TRACK_ALLOC_STATS
}
#ifdef GCMLOCALMEMORYBLOCKDEBUG
// Debug-only invariant checker: verifies guard cookies, index patch-backs,
// offset ordering, and that every freed slot in m_arrAllocations is accounted
// for by exactly one node on the pending-free or free list.
#define VALIDATECONDITION( x ) if( !( x ) ) { Error( "<vitaliy> GCM Local Memory Allocation block %p index %d is corrupt [line %d]!\n", pBlock, k, __LINE__ ); }
inline void CPs3gcmLocalMemoryAllocator::ValidateAllBlocks()
{
// Traverse the allocated list and validate debug guards and patch-back indices
CUtlVector< uint32 > arrFreeBlocksIdx;
uint32 uiLastAllocatedOffset = m_nOffsetMin;
for ( int k = 0, kEnd = m_arrAllocations.Count(); k < kEnd; ++ k )
{
CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock = m_arrAllocations[k];
VALIDATECONDITION( pBlock );
VALIDATECONDITION( pBlock->m_dbgGuardCookie == g_GcmLocalMemoryBlockDebugCookieAllocated || pBlock->m_dbgGuardCookie == g_GcmLocalMemoryBlockDebugCookieFree );
VALIDATECONDITION( pBlock->MutableIndex() < m_arrAllocations.Count() );
VALIDATECONDITION( pBlock->MutableIndex() == k );
VALIDATECONDITION( m_arrAllocations[ pBlock->MutableIndex() ] == pBlock );
VALIDATECONDITION( pBlock->Offset() >= uiLastAllocatedOffset );
uiLastAllocatedOffset = pBlock->Offset() + pBlock->MutableSize();
VALIDATECONDITION( uiLastAllocatedOffset <= m_nOffsetMax );
// Remember freed slots; the free-list walk below must consume all of them
if ( pBlock->m_dbgGuardCookie == g_GcmLocalMemoryBlockDebugCookieFree )
arrFreeBlocksIdx.AddToTail( k );
}
// Traverse free lists and validate
LocalMemoryAllocation_t * arrFree[] = { m_pPendingFreeBlock, m_pFreeBlock };
for ( int j = 0; j < ARRAYSIZE( arrFree ); ++ j )
for ( LocalMemoryAllocation_t *p = arrFree[j]; p; p = p->m_pNext )
{
// 'k' feeds the VALIDATECONDITION error message
int k = j;
CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock = &p->m_block;
VALIDATECONDITION( pBlock );
VALIDATECONDITION( pBlock->m_dbgGuardCookie == g_GcmLocalMemoryBlockDebugCookieFree );
k = pBlock->MutableIndex();
if ( pBlock->MutableIndex() != ~0 )
{
VALIDATECONDITION( pBlock->MutableIndex() < m_arrAllocations.Count() );
VALIDATECONDITION( m_arrAllocations[ pBlock->MutableIndex() ] == pBlock );
VALIDATECONDITION( arrFreeBlocksIdx.FindAndFastRemove( pBlock->MutableIndex() ) );
}
}
// Any leftover index means a freed slot with no free-list node backing it
int k = 0;
void *pBlock = 0;
VALIDATECONDITION( !arrFreeBlocksIdx.Count() );
}
#endif
// Defragments the pool: forcibly reclaims every pending free, discards the
// free list, then re-allocates every live block back-to-back from the bottom
// of the pool and queues RSX local-memory transfers to move each block's data
// from its old offset to its new one. REQUIRES RSX and PPU rendering to be
// fully stopped (callers bracket this with CmdBufferFinish()).
inline void CPs3gcmLocalMemoryAllocator::Compact()
{
GCM_PERF_PUSH_MARKER( "LocalMemory:Compact" );
#ifdef GCMLOCALMEMORYBLOCKDEBUG
ValidateAllBlocks();
if ( r_ps3_gcmnocompact.GetBool() )
return;
#endif
// Reclaim all memory (NOTE: all pending blocks must be reclaimed since both RSX and PPU have stopped rendering!)
Reclaim();
#ifdef GCMLOCALMEMORYBLOCKDEBUG
if ( m_pPendingFreeBlock )
Warning( "GCM Local Memory Allocator Compact forces pending free blocks to be reclaimed.\n" );
ValidateAllBlocks();
#endif
// Anything still pending had an unpassed fence; force-reclaim it now
if ( m_pPendingFreeBlock )
Reclaim( true );
#ifdef GCMLOCALMEMORYBLOCKDEBUG
if ( m_pPendingFreeBlock )
Error( "<vitaliy> GCM Local Memory Allocator Compact requires RSX and PPU rendering to be paused! (pending free blocks have not been reclaimed)\n" );
ValidateAllBlocks();
#endif
// Walk the free blocks chain and patch-back NULL pointers into allocation tracking system
while ( m_pFreeBlock )
{
LocalMemoryAllocation_t *p = m_pFreeBlock;
m_pFreeBlock = p->m_pNext;
m_arrAllocations[ p->m_block.MutableIndex() ] = NULL;
delete p;
}
Assert( !m_pFreeBlock && !m_pPendingFreeBlock );
// These are elements requiring reallocation
uint32 uiCount = m_arrAllocations.Count();
CPs3gcmLocalMemoryBlockMutable **pReallocationBlocks = m_arrAllocations.Base();
// Here "correct" implementation would be to copy off m_arrAllocations vector onto stack for iteration,
// RemoveAll from m_arrAllocations vector and allocate all blocks again.
// We will cheat since we know that we will allocate same number of elements and directly write zero
// into m_arrAllocations m_Size member, then we will still be able to use the memory of the vector
// for reading blocks requiring compact reallocation, and AddToTail will still fill the vector with
// correct data.
struct AllocatorCompactVectorCheat : public CUtlVector< CPs3gcmLocalMemoryBlockMutable * > { inline void ResetCountPreservingMemoryContents() { m_Size = 0; } };
( ( AllocatorCompactVectorCheat * ) ( char * ) &m_arrAllocations )->ResetCountPreservingMemoryContents();
m_nOffsetUnallocated = m_nOffsetMin;
// Prepare RSX for data buffer transfers in local memory
uint nTransferMode = ( ( this - &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ] ) < kGcmAllocPoolMainMemory ) ? CELL_GCM_TRANSFER_LOCAL_TO_LOCAL : CELL_GCM_TRANSFER_MAIN_TO_MAIN;
Assert( nTransferMode < 4 );
GCM_FUNC( cellGcmSetTransferDataMode, nTransferMode ); // unnecessary if we do this on SPU
Assert( !g_spuGcm.IsDeferredDrawQueue() );
// Reallocate all blocks
for ( ; uiCount; -- uiCount, ++ pReallocationBlocks )
{
CPs3gcmLocalMemoryBlockMutable *pBlock = *pReallocationBlocks;
// NULL slots are the freed blocks discarded above
if ( !pBlock )
continue;
uint32 nOldOffset = pBlock->Offset();
// Alloc() re-adds the stats, so subtract first to keep counters balanced
TrackAllocStats( pBlock->MutableType(), - pBlock->MutableSize() );
Alloc( pBlock );
// Block didn't move - no data transfer needed
if ( nOldOffset == pBlock->Offset() )
continue;
// Have RSX transfer blocks data. RSX may hang if there's WriteLabel between the Format and Offset commands,
// so reserve space for both of them up front
SpuDrawTransfer_t * pTransfer = g_spuGcm.GetDrawQueue()->AllocWithHeader<SpuDrawTransfer_t>( SPUDRAWQUEUE_TRANSFER_METHOD | nTransferMode );
pTransfer->m_nLineSize = pBlock->MutableSize();
pTransfer->m_nOldOffset = nOldOffset;
pTransfer->m_nNewOffset = pBlock->Offset();
}
#ifdef GCMLOCALMEMORYBLOCKDEBUG
ValidateAllBlocks();
#endif
GCM_PERF_MARKER( "Compact:Complete" );
}
//////////////////////////////////////////////////////////////////////////
//
// Computation of tiled memory
//
// Returns this block's base inside the RSX tile tag area, in 64KB (0x10000)
// units. Preset color pools (FB, FBQ, 512) get fixed bases in pool order
// starting at the FB pool; misc color tiles follow at their actual offset;
// depth tiles are packed downward from the top of the 0x800-entry tag range.
uint32 CPs3gcmLocalMemoryBlock::TiledMemoryTagAreaBase() const
{
CPs3gcmAllocationPool_t ePool = PS3GCMALLOCATIONPOOL(m_uType);
if ( ePool == kGcmAllocPoolTiledColorMisc ) // Misc color tiles are placed at the front of tag area after preset pools
return ( Offset() - g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFB].m_nOffsetMin ) / 0x10000;
if ( ePool == kGcmAllocPoolTiledD24S8 ) // Depth tiles are placed in the end of tag area (0-0x7FF is offset range)
return 0x800 - ( Offset() - g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledD24S8].m_nOffsetMin + m_uiSize ) / 0x10000;
// NOTE: the FB expression below is identically zero (x - x) by construction:
// the FB pool is the base the whole tag area is measured from.
if ( ePool == kGcmAllocPoolTiledColorFB ) // FB color tiles go first
return ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFB].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFB].m_nOffsetMin ) / 0x10000;
if ( ePool == kGcmAllocPoolTiledColorFBQ ) // FBQ color tiles go next
return ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFBQ].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFB].m_nOffsetMin ) / 0x10000;
if ( ePool == kGcmAllocPoolTiledColor512 ) // 512 color tiles go next
return ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColor512].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFB].m_nOffsetMin ) / 0x10000;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
Error( "<vitaliy> Cannot compute tiled memory tag base from a non-tiled-pool allocation!\n" );
#endif
return ~0;
}
// Returns this block's hardware tile index. The three preset color pools
// (FB, FBQ, 512) own fixed indices 0..2 in pool order; misc color tiles
// follow at 3 + allocation index; depth tiles count down from index 14
// (the hardware's last usable tile slot).
uint32 CPs3gcmLocalMemoryBlock::TiledMemoryIndex() const
{
	CPs3gcmAllocationPool_t nPool = PS3GCMALLOCATIONPOOL( m_uType );
	switch ( nPool )
	{
	case kGcmAllocPoolTiledColorMisc:
		// Misc color tiles start right after the preset pool tiles
		return m_uiIndex + kGcmAllocPoolTiledColorMisc - kGcmAllocPoolTiledColorFB;
	case kGcmAllocPoolTiledD24S8:
		// Depth tiles occupy the last slots, packed backwards from 14
		return 14 - m_uiIndex;
	default:
		// Preset pools map directly to their fixed tile index
		return nPool - kGcmAllocPoolTiledColorFB;
	}
}
// Returns this block's zcull region index. Only depth/stencil (D24S8)
// allocations own zcull regions; their allocation index doubles as the
// zcull index. Any other pool is a caller error.
uint32 CPs3gcmLocalMemoryBlock::ZcullMemoryIndex() const
{
	if ( PS3GCMALLOCATIONPOOL( m_uType ) != kGcmAllocPoolTiledD24S8 )
	{
#ifdef GCMLOCALMEMORYBLOCKDEBUG
		Error( "<vitaliy> Cannot compute zcull index from a non-zcull allocation!\n" );
#endif
		return ~0;
	}
	// Depth tiles are the only zcull tiles
	return m_uiIndex;
}
// Returns this block's start inside zcull memory. Only depth/stencil (D24S8)
// allocations have zcull storage; zcull uses 1 byte per pixel vs D24S8's
// 4 bytes, hence the /4 (alignment is implicit: offsets are 64KB aligned).
uint32 CPs3gcmLocalMemoryBlock::ZcullMemoryStart() const
{
	if ( PS3GCMALLOCATIONPOOL( m_uType ) != kGcmAllocPoolTiledD24S8 )
	{
#ifdef GCMLOCALMEMORYBLOCKDEBUG
		Error( "<vitaliy> Cannot compute zcull memory start from a non-zcull allocation!\n" );
#endif
		return ~0;
	}
	// 1 byte per pixel, D24S8 is 4 bytes per pixel, implicitly 4096 aligned because offset is 64Kb aligned
	return ( Offset() - g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledD24S8].m_nOffsetMin ) / 4;
}
//////////////////////////////////////////////////////////////////////////
//
// Allow shaderapi to query GPU memory stats:
//
// Copies the module-wide RSX memory statistics snapshot out to the caller
// (shaderapi's query entry point).
void GetGPUMemoryStats( GPUMemoryStats &stats )
{
stats = g_RsxMemoryStats;
}

View File

@@ -0,0 +1,995 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// GCM memory allocation mgmt
//
//==================================================================================================
#include "utlmap.h"
#include "sys/tty.h"
#include "convar.h"
#include "ps3gcmmemory.h"
#include "gcmlabels.h"
#include "gcmstate.h"
#include "gcmdrawstate.h"
#include "memdbgon.h"
PLATFORM_OVERRIDE_MEM_ALLOC_INTERNAL_PS3_IMPL
// Bytes reserved for the 64x64 ARGB hardware mouse cursor surface
#define HARDWARE_CURSOR_SIZE (64*64*4)
//--------------------------------------------------------------------------------------------------
// GCM memory allocators
//--------------------------------------------------------------------------------------------------
// Alloc-stats tracking is currently forced on in all builds (was _CERT-gated)
#if 1 // #ifndef _CERT
#define TRACK_ALLOC_STATS 1
#endif
#ifdef GCMLOCALMEMORYBLOCKDEBUG
// Debug knobs: disable compaction entirely / allow one compact on low memory
ConVar r_ps3_gcmnocompact( "r_ps3_gcmnocompact", "0" );
ConVar r_ps3_gcmlowcompact( "r_ps3_gcmlowcompact", "1" );
#endif
// Single mutex guarding all GCM local-memory allocator state, plus a counter
// of how many PS3ALLOCMTX scopes are currently alive (see below)
static CThreadFastMutex s_AllocMutex;
static int32 s_uiGcmLocalMemoryAllocatorMutexLockCount;
// RAII helper that tracks how deeply the allocator mutex is held; the counter
// lets IsItSafeToRefreshFrontBufferNonInteractivePs3() detect in-progress
// allocator work without blocking.
struct CGcmLocalMemoryAllocatorMutexLockCounter_t
{
CGcmLocalMemoryAllocatorMutexLockCounter_t() { Assert( s_uiGcmLocalMemoryAllocatorMutexLockCount >= 0 ); ++ s_uiGcmLocalMemoryAllocatorMutexLockCount; }
~CGcmLocalMemoryAllocatorMutexLockCounter_t() { Assert( s_uiGcmLocalMemoryAllocatorMutexLockCount > 0 ); -- s_uiGcmLocalMemoryAllocatorMutexLockCount; }
};
// Lock the allocator mutex for the enclosing scope and bump the lock counter
#define PS3ALLOCMTX AUTO_LOCK( s_AllocMutex ); CGcmLocalMemoryAllocatorMutexLockCounter_t aLockCounter;
// Returns true when the non-interactive front-buffer refresh may run:
// we must be on the main thread and no PS3ALLOCMTX scope may be active.
bool IsItSafeToRefreshFrontBufferNonInteractivePs3()
{
// NOTE: only main thread can refresh front buffer
if ( !ThreadInMainThread() )
return false;
// Take the mutex (without the PS3ALLOCMTX counter) so the count is stable
AUTO_LOCK( s_AllocMutex );
Assert( s_uiGcmLocalMemoryAllocatorMutexLockCount >= 0 );
return s_uiGcmLocalMemoryAllocatorMutexLockCount <= 0;
}
// Allocator-internal view of CPs3gcmLocalMemoryBlock that exposes writable
// references to its otherwise-immutable fields; blocks are reinterpret_cast
// to this type inside the allocator only.
struct CPs3gcmLocalMemoryBlockMutable : public CPs3gcmLocalMemoryBlock
{
inline uint32 & MutableOffset() { return m_nLocalMemoryOffset; }
inline uint32 & MutableSize() { return m_uiSize; }
inline CPs3gcmAllocationType_t & MutableType() { return m_uType; }
inline uint32 & MutableIndex() { return m_uiIndex; }
};
#ifdef GCMLOCALMEMORYBLOCKDEBUG
static const uint64 g_GcmLocalMemoryBlockDebugCookieAllocated = 0xA110CA7EDA110CA7ull;
static const uint64 g_GcmLocalMemoryBlockDebugCookieFree = 0xFEEFEEFEEFEEFEEFllu;
#endif
// Per-pool bump allocator with fenced deferred free, best-fit reuse and
// stop-the-world compaction. One instance exists per CPs3gcmAllocationPool_t
// (see g_ps3gcmLocalMemoryAllocator below).
struct CPs3gcmLocalMemoryAllocator
{
//////////////////////////////////////////////////////////////////////////
//
// Allocated memory tracking
//
uint32 m_nOffsetMin; // RSX Local Memory allocated by Initialization that will never be released
uint32 m_nOffsetMax; // Ceiling of allocatable RSX Local Memory (because the top portion is reserved for zcull/etc.), top portion managed separately
uint32 m_nOffsetUnallocated; // RSX Local Memory offset of not yet allocated memory (between Min and Max)
CUtlVector< CPs3gcmLocalMemoryBlockMutable * > m_arrAllocations; // Sorted array of all allocations
//////////////////////////////////////////////////////////////////////////
//
// Free blocks tracking
//
// A freed block: a copy of the block's state plus the RSX fence that must
// pass before its memory may be reused
struct LocalMemoryAllocation_t
{
CPs3gcmLocalMemoryBlockMutable m_block;
uint32 m_uiFenceNumber;
LocalMemoryAllocation_t *m_pNext;
};
LocalMemoryAllocation_t *m_pPendingFreeBlock; // freed, but RSX may still be reading them (newest first)
LocalMemoryAllocation_t *m_pFreeBlock; // fence passed; reusable by FindFreeBlock
static uint32 sm_uiFenceNumber; // monotonically increasing fence counter (shared by all pools)
uint32 m_uiFenceLastKnown; // last fence value observed; baseline for wraparound-safe compares
static uint32 volatile *sm_puiFenceLocation; // RSX-written label the fence values are read from
//////////////////////////////////////////////////////////////////////////
//
// Implementation
//
inline bool Alloc( CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock );
inline void Free( CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock );
inline uint32 Reclaim( bool bForce = false );
inline void Compact();
// Helper methods
inline LocalMemoryAllocation_t * FindFreeBlock( uint32 uiAlignBytes, uint32 uiSize );
inline bool IsFenceCompleted( uint32 uiCurrentFenceValue, uint32 uiCheckStoredFenceValue );
inline void TrackAllocStats( CPs3gcmAllocationType_t uAllocType, int nDelta );
#ifdef GCMLOCALMEMORYBLOCKDEBUG
inline void ValidateAllBlocks();
#endif
}
// One allocator instance per allocation pool, indexed by CPs3gcmAllocationPool_t
g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolCount];
uint32 CPs3gcmLocalMemoryAllocator::sm_uiFenceNumber = 1;
uint32 volatile * CPs3gcmLocalMemoryAllocator::sm_puiFenceLocation;
// RSX memory usage stats tracking:
static GPUMemoryStats g_RsxMemoryStats;
// Per-pool breakdown of RSX/main memory usage in bytes, maintained by
// TrackAllocStats_Pool().
struct GPUMemoryStats_Pool
{
	int nDefaultPoolSize;	// capacity of the default pool
	int nDefaultPoolUsed;	// bytes currently allocated from the default pool
	int nRTPoolUsed;		// bytes in the tiled render-target pools
	int nDynamicPoolUsed;	// bytes in the dynamic VB/IB pools
	int nMainMemUsed;		// bytes in main-memory-backed pools
	int nUnknownPoolUsed;	// bytes from any unrecognized pool
};
GPUMemoryStats_Pool g_RsxMemoryStats_Pool;
// Computes how many bytes of RSX local memory are required to hold nCount
// tiled color surfaces of w x h pixels at bpp bytes per pixel.  The total is
// rounded up to the tiled-color allocation alignment.
static inline uint32 Ps3gcmHelper_ComputeTiledAreaMemorySize( uint32 nCount, uint32 w, uint32 h, uint32 bpp )
{
	// Tiled surfaces use a hardware-defined pitch; height pads to 32 rows
	uint32 const uiTilePitch = cellGcmGetTiledPitchSize( w * bpp );
	uint32 const uiPaddedRows = AlignValue( h, 32 );
	// Bytes for all nCount surfaces, aligned for the tiled color pools
	return AlignValue( uiTilePitch * uiPaddedRows * nCount, PS3GCMALLOCATIONALIGN( kAllocPs3gcmColorBufferMisc ) );
}
// One-time initialization of all RSX local memory pools.
//
// Sets up the shared memory-free fence label, carves local memory into the
// per-pool regions described by s_PoolMemoryLayout (carved from the top of
// local memory downwards; the default pool receives everything that remains),
// configures the main-memory mapped pool, and binds the preset tiled regions
// for the framebuffer color pools.
void Ps3gcmLocalMemoryAllocator_Init()
{
	PS3ALLOCMTX
	if ( !CPs3gcmLocalMemoryAllocator::sm_puiFenceLocation )
	{
		// RSX writes increasing fence values to this label as queued frees
		// become reclaimable (see CPs3gcmLocalMemoryAllocator::Free)
		CPs3gcmLocalMemoryAllocator::sm_puiFenceLocation = cellGcmGetLabelAddress( GCM_LABEL_MEMORY_FREE );
		*CPs3gcmLocalMemoryAllocator::sm_puiFenceLocation = 0;
	}
	// Pool boundaries
	uint32 uiGcmAllocBegin = g_ps3gcmGlobalState.m_nLocalBaseOffset;
	uint32 uiGcmAllocEnd = uiGcmAllocBegin + g_ps3gcmGlobalState.m_nLocalSize;
	// Memory should be allocated for large frame buffers
	uint32 uiMemorySizeBuffer[2] = { MAX( 1280, g_ps3gcmGlobalState.m_nRenderSize[0] ), MAX( 720, g_ps3gcmGlobalState.m_nRenderSize[1] ) };
	uint32 uiFactor[2] = { uiMemorySizeBuffer[0]*uiMemorySizeBuffer[1], 1280*720 };
	// Configuration of pool memory (can be #ifdef'd for every game)
	static const uint32 s_PoolMemoryLayout[/*kGcmAllocPoolCount*/] =
	{
#if defined( CSTRIKE15 )
	// mhansen - We had to adjust the memory values a bit for cstrike15 to get a map to load
	// PS3_BUILDFIX - We need to revisit this to determine the proper size later on
	/*kGcmAllocPoolDefault = */ 0,
	/*kGcmAllocPoolDynamicNewPath = */ 6 * 1024 * 1024, // 6 MB
	/*kGcmAllocPoolDynamic = */ 11 * 1024 * 1024, // 11 MB
	/*kGcmAllocPoolTiledColorFB = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2 + CPs3gcmDisplay::SURFACE_COUNT, uiMemorySizeBuffer[0], uiMemorySizeBuffer[1], 4 ),
	// 2 buffers allocated in CreateRSXBuffers + 2 _rt_fullFrameFB - can probably get this down if...
	// 1. we clean up the post-pro rendering to use the front buffer as a texture, and
	// 2. tidy up aliasing for rt_fullframeFB and rt_fullFrameFB1
	/*kGcmAllocPoolTiledColorFBQ = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2, uiMemorySizeBuffer[0]/4, uiMemorySizeBuffer[1]/4, 4 ), // fits 2 1/4 size framebuffer textures
	/*kGcmAllocPoolTiledColor512 = */ 0,
	/*kGcmAllocPoolTiledColorMisc = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 1, 640, 640, 4 ) // RTT shadows ?
	+ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2, 1024, 512, 4) // Water
	+ Ps3gcmHelper_ComputeTiledAreaMemorySize(1, 32, 32, 4), // Eye Glint
	/*kGcmAllocPoolTiledD24S8 = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 1, 640*2, 640*2, 2)
	+ Ps3gcmHelper_ComputeTiledAreaMemorySize(1, 640, 640, 2) // CSM and Flashlight
	+ Ps3gcmHelper_ComputeTiledAreaMemorySize( 1, uiMemorySizeBuffer[0], uiMemorySizeBuffer[1], 4 ), // Main depth buffer
	/*kGcmAllocPoolMainMemory = */ 0, // configured based on mapped IO memory
	/*kGcmAllocPoolMallocMemory = */ 0, // using malloc
#else
	/*kGcmAllocPoolDefault = */ 0,
	/*kGcmAllocPoolDynamicNewPath = */ 5 * 1024 * 1024, // 5 MB
	/*kGcmAllocPoolDynamic = */ 10 * 1024 * 1024, // 10 MB
	/*kGcmAllocPoolTiledColorFB = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2 * CPs3gcmDisplay::SURFACE_COUNT, uiMemorySizeBuffer[0], uiMemorySizeBuffer[1], 4 ), // fits 6 of full framebuffer textures
	/*kGcmAllocPoolTiledColorFBQ = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 4, uiMemorySizeBuffer[0]/4, uiMemorySizeBuffer[1]/4, 4 ), // fits 4 quarters of framebuffer textures
	/*kGcmAllocPoolTiledColor512 = */ Ps3gcmHelper_ComputeTiledAreaMemorySize( 2, 512, 512, 4 ), // fits 2 512x512 RGBA textures
	/*kGcmAllocPoolTiledColorMisc = */ 5 * 1024 * 1024, // 5 MB
	/*kGcmAllocPoolTiledD24S8 = */ uint64( 15 * 1024 * 1024 ) * uiFactor[0]/uiFactor[1], // 15 MB, scaled by render resolution vs 1280x720
	/*kGcmAllocPoolMainMemory = */ 0, // configured based on mapped IO memory
	/*kGcmAllocPoolMallocMemory = */ 0, // using malloc
#endif
	};
	COMPILE_TIME_ASSERT( ARRAYSIZE( s_PoolMemoryLayout ) == ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ) );
	// Carve the explicitly-sized pools from the top of local memory downwards
	for ( int j = ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ); j -- > 0; )
	{
	const uint32 uiSize = AlignValue( s_PoolMemoryLayout[j], 1024 * 1024 ); // Align it on 1 MB boundaries, all our pools are large
	g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMax = uiGcmAllocEnd;
	uiGcmAllocEnd -= uiSize;
	g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMin =
	g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetUnallocated = uiGcmAllocEnd;
	}
	// Default pool setup (rest of local memory)
	g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetMax = uiGcmAllocEnd;
	g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetMin =
	g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetUnallocated = uiGcmAllocBegin + HARDWARE_CURSOR_SIZE;
	// Main memory mapped pool
	g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ].m_nOffsetMin =
	g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ].m_nOffsetUnallocated = uint32( g_ps3gcmGlobalState.m_pRsxMainMemoryPoolBuffer ) + g_ps3gcmGlobalState.m_nIoOffsetDelta;
	g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ].m_nOffsetMax = g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ].m_nOffsetMin + g_ps3gcmGlobalState.m_nRsxMainMemoryPoolBufferSize;
	// Store initial capacity for memory stats tracking:
	g_RsxMemoryStats.nGPUMemSize = g_ps3gcmGlobalState.m_nLocalSize;
	g_RsxMemoryStats_Pool.nDefaultPoolSize = g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ].m_nOffsetMin;
	//
	// Setup preset tiled regions
	//
	{
	// Full-size framebuffer color targets: tile region 0, bank 0
	CPs3gcmAllocationPool_t ePool = kGcmAllocPoolTiledColorFB;
	uint8 uiBank = 0; // bank 0..3
	uint32 nRenderPitch = cellGcmGetTiledPitchSize( g_ps3gcmGlobalState.m_nRenderSize[0] * 4 );
	uint8 uiTileIndex = ePool - kGcmAllocPoolTiledColorFB;
	cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL,
	g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
	g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
	nRenderPitch, CELL_GCM_COMPMODE_DISABLED,
	( g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolTiledColorFB ].m_nOffsetMin ) / 0x10000, // The area base + size/0x10000 will be allocated as the tag area.
	uiBank );
	cellGcmBindTile( uiTileIndex );
	}
	{
	// Quarter-size framebuffer color targets: tile region 1, bank 1
	CPs3gcmAllocationPool_t ePool = kGcmAllocPoolTiledColorFBQ;
	uint8 uiBank = 1; // bank 0..3
	uint32 nRenderPitch = cellGcmGetTiledPitchSize( g_ps3gcmGlobalState.m_nRenderSize[0] * 4 / 4 );
	uint8 uiTileIndex = ePool - kGcmAllocPoolTiledColorFB;
	cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL,
	g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
	g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
	nRenderPitch, CELL_GCM_COMPMODE_DISABLED,
	( g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolTiledColorFB ].m_nOffsetMin ) / 0x10000, // The area base + size/0x10000 will be allocated as the tag area.
	uiBank );
	cellGcmBindTile( uiTileIndex );
	}
	{
	// 512x512 color targets: tile region 2, bank 2
	CPs3gcmAllocationPool_t ePool = kGcmAllocPoolTiledColor512;
	uint8 uiBank = 2; // bank 0..3
	uint32 nRenderPitch = cellGcmGetTiledPitchSize( 512 * 4 );
	uint8 uiTileIndex = ePool - kGcmAllocPoolTiledColorFB;
	cellGcmSetTileInfo( uiTileIndex, CELL_GCM_LOCATION_LOCAL,
	g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
	g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin,
	nRenderPitch, CELL_GCM_COMPMODE_DISABLED,
	( g_ps3gcmLocalMemoryAllocator[ ePool ].m_nOffsetMin - g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolTiledColorFB ].m_nOffsetMin ) / 0x10000, // The area base + size/0x10000 will be allocated as the tag area.
	uiBank );
	cellGcmBindTile( uiTileIndex );
	}
#ifndef _CERT
	// Dev builds: log the final pool layout
	static const char * s_PoolMemoryNames[] =
	{
	/*kGcmAllocPoolDefault = */ "Default Pool",
	/*kGcmAllocPoolDynamicNewPath = */ "Dynamic New ",
	/*kGcmAllocPoolDynamic = */ "Dynamic IBVB",
	/*kGcmAllocPoolTiledColorFB = */ "FullFrameRTs",
	/*kGcmAllocPoolTiledColorFBQ = */ "1/4Frame RTs",
	/*kGcmAllocPoolTiledColor512 = */ "512x512 RTs ",
	/*kGcmAllocPoolTiledColorMisc = */ "All Misc RTs",
	/*kGcmAllocPoolTiledD24S8 = */ "DepthStencil",
	/*kGcmAllocPoolMainMemory = */ "Main Memory ",
	/*kGcmAllocPoolMallocMemory = */ "MallocMemory",
	};
	COMPILE_TIME_ASSERT( ARRAYSIZE( s_PoolMemoryNames ) == ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ) );
	Msg( "RSX Local Memory layout:\n" );
	for ( int j = 0; j < ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ); ++ j )
	{
	Msg( " %s 0x%08X - 0x%08X [ %9.3f MB ]\n",
	s_PoolMemoryNames[j],
	g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMin,
	g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMax,
	(g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[ j ].m_nOffsetMin) / 1024.f / 1024.f );
	}
	Msg( "Total size: %d MB\n", g_ps3gcmGlobalState.m_nLocalSize / 1024 / 1024 );
#endif
}
void Ps3gcmLocalMemoryAllocator_Reclaim()
{
PS3ALLOCMTX
for ( int k = 0; k < ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ); ++ k )
g_ps3gcmLocalMemoryAllocator[ k ].Reclaim();
}
// Compacts every local memory pool: stalls until the RSX has finished all
// queued work, slides live allocations down over freed holes (queuing
// RSX-side copies of block contents), then waits for those transfers to
// complete.  Extremely expensive -- intended for level transitions etc.
void Ps3gcmLocalMemoryAllocator_Compact()
{
	PS3ALLOCMTX
#define PS3GCMCOMPACTPROFILE 0
#if PS3GCMCOMPACTPROFILE
	float flTimeStart = Plat_FloatTime();
	uint32 uiFree = g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated;
#endif
	// Let PPU wait for all RSX commands done (include waitFlip)
	// Flush GPU right up to current point - Endframe call does this...
	gpGcmDrawState->EndFrame();
	gpGcmDrawState->CmdBufferFinish();
#if PS3GCMCOMPACTPROFILE
	float flTimeWait = Plat_FloatTime() - flTimeStart;
#endif
	{
	// Locking out memory mgmt for the whole of the compact before this
	// PS3ALLOCMTX
	for ( int k = 0; k < ARRAYSIZE( g_ps3gcmLocalMemoryAllocator ); ++ k )
	{
	g_ps3gcmLocalMemoryAllocator[ k ].Compact();
	}
	}
#if PS3GCMCOMPACTPROFILE
	float flTimePrepareTransfer = Plat_FloatTime() - flTimeStart;
#endif
	// Wait for all RSX memory to be transferred
	gpGcmDrawState->EndFrame();
	gpGcmDrawState->CmdBufferFinish();
#if PS3GCMCOMPACTPROFILE
	float flTimeDone = Plat_FloatTime() - flTimeStart;
	char chBuffer[64];
	Q_snprintf( chBuffer, ARRAYSIZE( chBuffer ), "COMPACT: %0.3f / %0.3f / %0.3f sec\n",
	flTimeWait, flTimePrepareTransfer, flTimeDone );
	uint32 dummy;
	sys_tty_write( SYS_TTYP6, chBuffer, Q_strlen( chBuffer ), &dummy );
	Q_snprintf( chBuffer, ARRAYSIZE( chBuffer ), "COMPACT: %0.3f -> %0.3f MB (%0.3f MB free)\n",
	uiFree / 1024.f / 1024.f, g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated / 1024.f / 1024.f,
	(g_ps3gcmLocalMemoryAllocator[0].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated) / 1024.f / 1024.f );
	sys_tty_write( SYS_TTYP6, chBuffer, Q_strlen( chBuffer ), &dummy );
#endif
}
void Ps3gcmLocalMemoryAllocator_CompactWithReason( char const *szReason )
{
double flTimeCompactStart = Plat_FloatTime();
DevMsg( "====== GCM LOCAL MEMORY COMPACT : %s =====\n", szReason );
uint32 uiFreeMemoryBeforeCompact = g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated;
DevMsg( "RSX Local Memory Free: %0.3f MB; compacting...\n", (g_ps3gcmLocalMemoryAllocator[0].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated) / 1024.f / 1024.f );
Ps3gcmLocalMemoryAllocator_Compact();
DevMsg( "RSX Local Memory Compacted %0.3f MB in %0.3f sec\n",
(uiFreeMemoryBeforeCompact - g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated) / 1024.f / 1024.f,
Plat_FloatTime() - flTimeCompactStart );
DevMsg( "RSX Local Memory Free: %0.3f MB\n", (g_ps3gcmLocalMemoryAllocator[0].m_nOffsetMax - g_ps3gcmLocalMemoryAllocator[0].m_nOffsetUnallocated) / 1024.f / 1024.f );
}
bool CPs3gcmLocalMemoryBlock::Alloc()
{
PS3ALLOCMTX
return g_ps3gcmLocalMemoryAllocator[PS3GCMALLOCATIONPOOL(m_uType)].Alloc( reinterpret_cast< CPs3gcmLocalMemoryBlockMutable * >( this ) );
}
void CPs3gcmLocalMemoryBlock::Free()
{
PS3ALLOCMTX
g_ps3gcmLocalMemoryAllocator[PS3GCMALLOCATIONPOOL(m_uType)].Free( reinterpret_cast< CPs3gcmLocalMemoryBlockMutable * >( this ) );
}
//////////////////////////////////////////////////////////////////////////
//
// Private implementation of PS3 local memory allocator
//
// Allocates local memory for pBlock from this pool.
//
// Strategy: (1) reuse a reclaimed free block of suitable size/alignment;
// (2) otherwise bump the pool's unallocated edge.  When the pool is full it
// first stalls, waiting for the RSX to release pending free blocks, then
// compacts the pool once, and finally raises a fatal Error -- except for the
// main-memory pool, which returns false so the caller can fall back to a
// local-memory pool.  The malloc-backed pool simply heap-allocates.
// Returns true on success.
inline bool CPs3gcmLocalMemoryAllocator::Alloc( CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock )
{
	TrackAllocStats( pBlock->MutableType(), pBlock->MutableSize() );
	uint32 uAlignBytes = PS3GCMALLOCATIONALIGN( pBlock->MutableType() );
	Assert( IsPowerOfTwo( uAlignBytes ) );
	double flAllocatorStallTime = 0.0f; // non-zero once we have started stalling on the RSX
	bool bCompactPerformed = false;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	// r_ps3_gcmlowcompact 0 pretends a compact already happened, disabling the low-memory compact path
	bCompactPerformed = !r_ps3_gcmlowcompact.GetBool();
#endif
retry_allocation:
	// Try to find a free block
	if ( LocalMemoryAllocation_t *pFreeBlock = FindFreeBlock( uAlignBytes, pBlock->MutableSize() ) )
	{
	// Take over the free block's offset and its slot in the allocation table
	pBlock->MutableOffset() = pFreeBlock->m_block.MutableOffset();
	pBlock->MutableIndex() = pFreeBlock->m_block.MutableIndex();
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	if ( m_arrAllocations[ pBlock->MutableIndex() ] != &pFreeBlock->m_block )
	Error( "<vitaliy> GCM Local Memory Allocator Error (attempt to reuse invalid free block)!" );
#endif
	m_arrAllocations[ pBlock->MutableIndex() ] = reinterpret_cast< CPs3gcmLocalMemoryBlockMutable * >( pBlock );
	delete pFreeBlock;
	}
	else if ( this != &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMallocMemory ] )
	{
	// Allocate new block
	// NOTE(review): uiOldUnallocatedEdge appears unused in this function
	uint32 uiOldUnallocatedEdge = m_nOffsetUnallocated;
	uint32 uiFreeBlock = ( m_nOffsetUnallocated + uAlignBytes - 1 ) & ~( uAlignBytes - 1 );
	// Check if there's enough space in this pool for the requested block
	if ( uiFreeBlock + pBlock->MutableSize() > m_nOffsetMax )
	{
	// There's not enough space in this pool
	if ( m_pPendingFreeBlock )
	{
	// There are pending free blocks, we just need to wait for
	// RSX to finish rendering using them
	if ( !flAllocatorStallTime )
	{
	flAllocatorStallTime = Plat_FloatTime();
	// Flush GPU right up to current point - Endframe call does this...
	gpGcmDrawState->EndFrame();
	gpGcmDrawState->CmdBufferFlush();
	}
	// Spin until a big-enough block is reclaimed or nothing is left pending
	while ( Reclaim() < pBlock->MutableSize() && m_pPendingFreeBlock )
	{
	ThreadSleep( 1 );
	}
	goto retry_allocation;
	}
	else if ( !bCompactPerformed )
	{
	// Main-memory pool never compacts; let the caller fall back
	if (this == &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ]) return false;
	// Let PPU wait for all RSX commands done
	gpGcmDrawState->EndFrame();
	gpGcmDrawState->CmdBufferFinish();
	uint32 uiFragmentedFreeSpace = m_nOffsetMax - m_nOffsetUnallocated;
	for ( LocalMemoryAllocation_t *pFreeFragment = m_pFreeBlock; pFreeFragment; pFreeFragment = pFreeFragment->m_pNext )
	uiFragmentedFreeSpace += pFreeFragment->m_block.MutableSize();
	Warning(
	"**************** GCM LOCAL MEMORY LOW *****************\n"
	"<vitaliy> GCM Local Memory Allocator#%d pool compacting!\n"
	" Requested allocation %u bytes.\n"
	" Pool capacity %u bytes.\n"
	" Free fragmented space %u bytes.\n"
	" Unallocated %u bytes.\n"
	" Used %u bytes.\n",
	this - g_ps3gcmLocalMemoryAllocator,
	( uint32 ) pBlock->MutableSize(),
	m_nOffsetMax - m_nOffsetMin,
	uiFragmentedFreeSpace,
	m_nOffsetMax - m_nOffsetUnallocated,
	m_nOffsetUnallocated - m_nOffsetMin
	);
	Compact();
	Warning( " ---> Compacted pool#%d has %u unallocated bytes.\n",
	this - g_ps3gcmLocalMemoryAllocator,
	m_nOffsetMax - m_nOffsetUnallocated );
	bCompactPerformed = true;
	// Wait for all RSX memory to be transferred
	gpGcmDrawState->EndFrame();
	gpGcmDrawState->CmdBufferFinish();
	goto retry_allocation;
	}
	else
	{
	// Main memory pool returns failure so caller can try local pool.
	if (this == &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMainMemory ]) return false;
	uint32 uiFragmentedFreeSpace = m_nOffsetMax - m_nOffsetUnallocated;
	for ( LocalMemoryAllocation_t *pFreeFragment = m_pFreeBlock; pFreeFragment; pFreeFragment = pFreeFragment->m_pNext )
	uiFragmentedFreeSpace += pFreeFragment->m_block.MutableSize();
	// Fatal: even after stall + compact the pool cannot fit the request
	Error(
	"********* OUT OF GCM LOCAL MEMORY ********************\n"
	"<vitaliy> GCM Local Memory Allocator#%d pool exhausted!\n"
	" Failed allocation %u bytes.\n"
	" Pool capacity %u bytes.\n"
	" Free fragmented space %u bytes.\n"
	" Unallocated %u bytes.\n"
	" Used %u bytes.\n",
	this - g_ps3gcmLocalMemoryAllocator,
	( uint32 ) pBlock->MutableSize(),
	m_nOffsetMax - m_nOffsetMin,
	uiFragmentedFreeSpace,
	m_nOffsetMax - m_nOffsetUnallocated,
	m_nOffsetUnallocated - m_nOffsetMin
	);
	}
	}
	// update the pointer to "unallocated" realm
	m_nOffsetUnallocated = uiFreeBlock + pBlock->MutableSize();
	// this is the last allocation so far
	pBlock->MutableIndex() = m_arrAllocations.AddToTail( reinterpret_cast< CPs3gcmLocalMemoryBlockMutable * >( pBlock ) );
	pBlock->MutableOffset() = uiFreeBlock;
	}
	else
	{
	// Malloc-backed pool: Offset() stores the heap address directly
	MEM_ALLOC_CREDIT_( "GCM Malloc Pool" );
	void *pvMallocMemory = MemAlloc_AllocAligned( pBlock->MutableSize(), uAlignBytes );
	pBlock->MutableOffset() = (uint32) pvMallocMemory;
	pBlock->MutableIndex() = ~0;
	}
	// 7LTODO if ( flAllocatorStallTime )
	// g_ps3gcmGlobalState.m_flAllocatorStallTimeWaitingRSX += Plat_FloatTime() - flAllocatorStallTime;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	// PS3 doesn't allow more than 8 zcull regions (index 0..7)
	if ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledD24S8].m_arrAllocations.Count() > 8 )
	Error( "PS3 number of zcull regions exceeded!\n" );
	// PS3 doesn't allow more than 15 tiles regions (index 0..14)
	if ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledD24S8].m_arrAllocations.Count() +
	g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorMisc].m_arrAllocations.Count() +
	( kGcmAllocPoolTiledColorMisc - kGcmAllocPoolTiledColorFB )
	> 15 )
	Error( "PS3 number of tiled regions exceeded!\n" );
	pBlock->m_dbgGuardCookie = g_GcmLocalMemoryBlockDebugCookieAllocated;
#endif
	return true;
}
// Queues pBlock for deallocation.  The memory cannot be reused immediately
// because the RSX may still be reading it: a copy of the block's tracking
// data goes onto the pending-free chain tagged with a fence number, and a
// write-back label command is queued so Reclaim() can tell when the GPU has
// passed this point in the command stream.
inline void CPs3gcmLocalMemoryAllocator::Free( CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock )
{
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	if ( !pBlock ||
	pBlock->m_dbgGuardCookie != g_GcmLocalMemoryBlockDebugCookieAllocated ||
	( ( pBlock->MutableIndex() != ~0 ) && ( m_arrAllocations[ pBlock->MutableIndex() ] != pBlock ) ) )
	{
	//DebuggerBreak();
	Error( "<vitaliy> Attempt to free not allocated GCM local memory block!" );
	}
	pBlock->m_dbgGuardCookie = g_GcmLocalMemoryBlockDebugCookieFree;
#endif
	// Tracking copy of the block lives on the pending-free chain from now on
	LocalMemoryAllocation_t *pDealloc = new LocalMemoryAllocation_t;
	pDealloc->m_block = *pBlock;
	pDealloc->m_uiFenceNumber = sm_uiFenceNumber;
	sm_uiFenceNumber ++;
	if(!sm_uiFenceNumber)sm_uiFenceNumber = 1;	// skip 0 on wrap-around (label starts at 0)
	pDealloc->m_pNext = m_pPendingFreeBlock;
	gpGcmDrawState->SetWriteBackEndLabel(GCM_LABEL_MEMORY_FREE, sm_uiFenceNumber);
	m_pPendingFreeBlock = pDealloc;
	TrackAllocStats( pBlock->MutableType(), - pBlock->MutableSize() );
	if ( pBlock->MutableIndex() != ~0 )	// index ~0 == malloc-backed block, not in the table
	{
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	if ( m_arrAllocations[ pBlock->MutableIndex() ] != pBlock )
	Error( "<vitaliy> GCM Local Memory Allocator Error (freeing block that is not properly registered)!" );
#endif
	// Point the allocation table at the pending-free copy
	m_arrAllocations[ pBlock->MutableIndex() ] = &pDealloc->m_block;
	}
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	pBlock->MutableOffset() = ~0;
	pBlock->MutableIndex() = ~0;
#endif
}
// Returns true when the RSX has passed the stored fence value
// uiCheckStoredFenceValue, given the most recently read label value
// uiCurrentFenceValue.  Both values are rebased against m_uiFenceLastKnown
// with unsigned subtraction so the comparison survives the 32-bit fence
// counter wrapping around.
inline bool CPs3gcmLocalMemoryAllocator::IsFenceCompleted( uint32 uiCurrentFenceValue, uint32 uiCheckStoredFenceValue )
{
#if GCM_ALLOW_NULL_FLIPS
	extern bool g_ps3_nullflips;
	if ( g_ps3_nullflips )
		return true;	// null flips: GPU work is discarded, everything counts as done
#endif
	// Needs to handle the counter wrapping around
	return ( ( uiCurrentFenceValue - m_uiFenceLastKnown ) >= ( uiCheckStoredFenceValue - m_uiFenceLastKnown ) );
}
// Moves pending-free blocks whose fence the RSX has passed onto the reusable
// free chain (or heap-frees them for the malloc-backed pool).  Each
// reclaimed block's usable size is recomputed as the gap up to the next live
// allocation (or the unallocated edge), absorbing alignment padding.
// bForce skips the fence check -- caller must guarantee the RSX is idle.
// Returns the size of the largest block reclaimed this call (0 when none).
inline uint32 CPs3gcmLocalMemoryAllocator::Reclaim( bool bForce )
{
	uint32 uiLargestBlockSizeReclaimed = 0;
	uint32 uiCurrentFenceValue = *sm_puiFenceLocation;
	// Walk pending free blocks and see if they are no longer
	// in use by RSX:
	LocalMemoryAllocation_t **p = &m_pPendingFreeBlock;
	if ( !bForce ) while ( (*p) && !IsFenceCompleted( uiCurrentFenceValue, (*p)->m_uiFenceNumber ) )
	p = &( (*p)->m_pNext );
	// Now p is pointing to the chain of free blocks
	// chain that has been completed (due to the nature of
	// pushing new deallocation at the head of the pending
	// list)
	if ( *p )
	{
	LocalMemoryAllocation_t *pCompletedChain = *p;
	*p = NULL; // Terminate the chain
	// Handle the special case of malloc reclaim - free all memory
	if ( this == &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolMallocMemory ] )
	{
	MEM_ALLOC_CREDIT_( "GCM Malloc Pool" );
	for ( LocalMemoryAllocation_t *pActualFree = pCompletedChain; pActualFree; )
	{
	MemAlloc_FreeAligned( pActualFree->m_block.DataInMallocMemory() );
	LocalMemoryAllocation_t *pDelete = pActualFree;
	pActualFree = pActualFree->m_pNext;
	delete pDelete;
	}
	pCompletedChain = NULL;
	}
	// Relink the completed pending chain into
	// the free blocks chain
	LocalMemoryAllocation_t **ppFree = &m_pFreeBlock;
	while ( *ppFree )
	ppFree = &( (*ppFree)->m_pNext );
	*ppFree = pCompletedChain;
	// Recompute actual free sizes of the completed chain
	// Actual free size is the delta between block offset and next block offset
	// When there's no next block then its delta between block offset and unallocated edge
	for ( LocalMemoryAllocation_t *pActualFree = pCompletedChain; pActualFree; pActualFree = pActualFree->m_pNext )
	{
	uint32 uiIdx = pActualFree->m_block.MutableIndex() + 1;
	uint32 uiNextOffset = m_nOffsetUnallocated;
	if ( uiIdx < m_arrAllocations.Count() )
	{
	CPs3gcmLocalMemoryBlockMutable * RESTRICT pNextBlock = m_arrAllocations[ uiIdx ];
	uiNextOffset = pNextBlock->Offset();
	}
	uint32 uiActualBlockSize = uiNextOffset - pActualFree->m_block.Offset();
	pActualFree->m_block.MutableSize() = uiActualBlockSize;
	uiLargestBlockSizeReclaimed = MAX( uiLargestBlockSizeReclaimed, uiActualBlockSize );
	}
	}
	// Remember the last known fence value
	m_uiFenceLastKnown = uiCurrentFenceValue;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	ValidateAllBlocks();
#endif
	return uiLargestBlockSizeReclaimed;
}
// Searches the free-block chain for the best-fit block that can service a
// request of uiSize bytes at uiAlignBytes alignment.  Blocks more than 10%
// larger than the request are rejected to limit internal fragmentation.
// The chosen block is unlinked from the free chain and returned;
// NULL when no suitable block exists.
inline CPs3gcmLocalMemoryAllocator::LocalMemoryAllocation_t * CPs3gcmLocalMemoryAllocator::FindFreeBlock( uint32 uiAlignBytes, uint32 uiSize )
{
	uint32 const uiMaxAcceptableSize = uiSize * 11/10; // we don't want to inflate requested size by > 10%
	LocalMemoryAllocation_t **ppCandidate = NULL;
	for ( LocalMemoryAllocation_t **ppLink = &m_pFreeBlock; (*ppLink); ppLink = &( (*ppLink)->m_pNext ) )
	{
		uint32 const uiBlockSize = (*ppLink)->m_block.MutableSize();
		bool const bSizeAcceptable = ( uiBlockSize >= uiSize ) && ( uiBlockSize <= uiMaxAcceptableSize );
		bool const bAligned = ( ( (*ppLink)->m_block.Offset() & ( uiAlignBytes - 1 ) ) == 0 );
		if ( !bSizeAcceptable || !bAligned )
			continue;
		// Best fit: keep the smallest acceptable block; <= keeps the later
		// of equal-size blocks (matches original selection order)
		if ( !ppCandidate || ( uiBlockSize <= (*ppCandidate)->m_block.MutableSize() ) )
			ppCandidate = ppLink;
	}
	if ( !ppCandidate )
		return NULL;
	// Unlink the chosen block and hand it to the caller
	LocalMemoryAllocation_t *pResult = (*ppCandidate);
	(*ppCandidate) = pResult->m_pNext;
	pResult->m_pNext = NULL;
	return pResult;
}
// Adjusts the per-pool GPU memory statistics by nDelta bytes for the pool
// backing the given allocation type.  Returns true when the pool lives in
// RSX local memory, false when it is backed by main (system) memory.
inline bool TrackAllocStats_Pool( CPs3gcmAllocationType_t uAllocType, int nDelta )
{
	int *pnPoolCounter;
	bool bResidesInRSXMemory = true;
	switch ( PS3GCMALLOCATIONPOOL( uAllocType ) )
	{
	case kGcmAllocPoolDefault:
		pnPoolCounter = &g_RsxMemoryStats_Pool.nDefaultPoolUsed;
		break;
	case kGcmAllocPoolDynamicNewPath:
	case kGcmAllocPoolDynamic:
		pnPoolCounter = &g_RsxMemoryStats_Pool.nDynamicPoolUsed;
		break;
	case kGcmAllocPoolTiledColorFB:
	case kGcmAllocPoolTiledColorFBQ:
	case kGcmAllocPoolTiledColor512:
	case kGcmAllocPoolTiledColorMisc:
	case kGcmAllocPoolTiledD24S8:
		pnPoolCounter = &g_RsxMemoryStats_Pool.nRTPoolUsed;
		break;
	case kGcmAllocPoolMainMemory: // Unused, unless PS3GCM_VBIB_IN_IO_MEMORY set to 1
	case kGcmAllocPoolMallocMemory:
		pnPoolCounter = &g_RsxMemoryStats_Pool.nMainMemUsed;
		bResidesInRSXMemory = false; // In main memory!
		break;
	default:
		pnPoolCounter = &g_RsxMemoryStats_Pool.nUnknownPoolUsed;
		break;
	}
	*pnPoolCounter += nDelta;
	Assert( 0 <= (int)*pnPoolCounter );
	// Report free memory only from the default pool (the other pools are pre-sized to fixed limits, and all
	// geom/textures go into the default pool, so that's where content-driven variation/failures will occur)
	g_RsxMemoryStats.nGPUMemFree = g_RsxMemoryStats_Pool.nDefaultPoolSize - g_RsxMemoryStats_Pool.nDefaultPoolUsed;
	return bResidesInRSXMemory;
}
// Adjusts the global RSX memory usage statistics (g_RsxMemoryStats) by
// nDelta bytes for the given allocation type.  Compiled to a no-op unless
// TRACK_ALLOC_STATS is enabled.
inline void CPs3gcmLocalMemoryAllocator::TrackAllocStats( CPs3gcmAllocationType_t uAllocType, int nDelta )
{
#if TRACK_ALLOC_STATS
	// Per-pool stats first; early-out for allocations not in RSX memory
	if ( !TrackAllocStats_Pool( uAllocType, nDelta ) )
		return;
	unsigned int *pnCounter;
	switch ( uAllocType )
	{
	case kAllocPs3gcmColorBufferMisc:
	case kAllocPs3gcmColorBufferFB:
	case kAllocPs3gcmColorBufferFBQ:
	case kAllocPs3gcmColorBuffer512:
	case kAllocPs3gcmDepthBuffer:
		pnCounter = &g_RsxMemoryStats.nRTSize;
		break;
	case kAllocPs3gcmTextureData:
	case kAllocPs3gcmTextureData0:
		pnCounter = &g_RsxMemoryStats.nTextureSize;
		break;
	case kAllocPs3GcmVertexBuffer:
		pnCounter = &g_RsxMemoryStats.nVBSize;
		break;
	case kAllocPs3GcmIndexBuffer:
		pnCounter = &g_RsxMemoryStats.nIBSize;
		break;
	default:
		// Shaders, EDGE geom buffers and dynamic pools are treated as misc
		// unless they become big/variable
		pnCounter = &g_RsxMemoryStats.nUnknown;
		break;
	}
	*pnCounter += nDelta;
	Assert( 0 <= (int)*pnCounter );
#endif // TRACK_ALLOC_STATS
}
#ifdef GCMLOCALMEMORYBLOCKDEBUG
#define VALIDATECONDITION( x ) if( !( x ) ) { Error( "<vitaliy> GCM Local Memory Allocation block %p index %d is corrupt [line %d]!\n", pBlock, k, __LINE__ ); }
// Debug-only consistency check: verifies guard cookies, index back-pointers,
// non-overlapping ascending offsets for all tracked allocations, and that
// every allocation marked free appears on exactly one free chain.
// (The locals k/pBlock exist because VALIDATECONDITION's error message
// references them by name.)
inline void CPs3gcmLocalMemoryAllocator::ValidateAllBlocks()
{
	// Traverse the allocated list and validate debug guards and patch-back indices
	CUtlVector< uint32 > arrFreeBlocksIdx;
	uint32 uiLastAllocatedOffset = m_nOffsetMin;
	for ( int k = 0, kEnd = m_arrAllocations.Count(); k < kEnd; ++ k )
	{
	CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock = m_arrAllocations[k];
	VALIDATECONDITION( pBlock );
	VALIDATECONDITION( pBlock->m_dbgGuardCookie == g_GcmLocalMemoryBlockDebugCookieAllocated || pBlock->m_dbgGuardCookie == g_GcmLocalMemoryBlockDebugCookieFree );
	VALIDATECONDITION( pBlock->MutableIndex() < m_arrAllocations.Count() );
	VALIDATECONDITION( pBlock->MutableIndex() == k );
	VALIDATECONDITION( m_arrAllocations[ pBlock->MutableIndex() ] == pBlock );
	VALIDATECONDITION( pBlock->Offset() >= uiLastAllocatedOffset );
	uiLastAllocatedOffset = pBlock->Offset() + pBlock->MutableSize();
	VALIDATECONDITION( uiLastAllocatedOffset <= m_nOffsetMax );
	if ( pBlock->m_dbgGuardCookie == g_GcmLocalMemoryBlockDebugCookieFree )
	arrFreeBlocksIdx.AddToTail( k );
	}
	// Traverse free lists and validate
	LocalMemoryAllocation_t * arrFree[] = { m_pPendingFreeBlock, m_pFreeBlock };
	for ( int j = 0; j < ARRAYSIZE( arrFree ); ++ j )
	for ( LocalMemoryAllocation_t *p = arrFree[j]; p; p = p->m_pNext )
	{
	int k = j;
	CPs3gcmLocalMemoryBlockMutable * RESTRICT pBlock = &p->m_block;
	VALIDATECONDITION( pBlock );
	VALIDATECONDITION( pBlock->m_dbgGuardCookie == g_GcmLocalMemoryBlockDebugCookieFree );
	k = pBlock->MutableIndex();
	if ( pBlock->MutableIndex() != ~0 )
	{
	VALIDATECONDITION( pBlock->MutableIndex() < m_arrAllocations.Count() );
	VALIDATECONDITION( m_arrAllocations[ pBlock->MutableIndex() ] == pBlock );
	VALIDATECONDITION( arrFreeBlocksIdx.FindAndFastRemove( pBlock->MutableIndex() ) );
	}
	}
	// Every free-marked allocation must have been found on a free chain
	int k = 0;
	void *pBlock = 0;
	VALIDATECONDITION( !arrFreeBlocksIdx.Count() );
}
#endif
// Compacts this pool in place: reclaims all freed blocks, drops the free
// chains, then re-allocates every live block tightly from the bottom of the
// pool, queuing RSX transfer commands to copy each moved block's contents
// from its old offset to its new one.  Caller MUST have stalled both the PPU
// command stream and the RSX first (see Ps3gcmLocalMemoryAllocator_Compact).
inline void CPs3gcmLocalMemoryAllocator::Compact()
{
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	ValidateAllBlocks();
	if ( r_ps3_gcmnocompact.GetBool() )
	return;
#endif
	// Reclaim all memory (NOTE: all pending blocks must be reclaimed since both RSX and PPU have stopped rendering!)
	Reclaim();
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	if ( m_pPendingFreeBlock )
	Warning( "GCM Local Memory Allocator Compact forces pending free blocks to be reclaimed.\n" );
	ValidateAllBlocks();
#endif
	// Force-reclaim anything still pending (safe: RSX is idle at this point)
	if ( m_pPendingFreeBlock )
	Reclaim( true );
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	if ( m_pPendingFreeBlock )
	Error( "<vitaliy> GCM Local Memory Allocator Compact requires RSX and PPU rendering to be paused! (pending free blocks have not been reclaimed)\n" );
	ValidateAllBlocks();
#endif
	// Walk the free blocks chain and patch-back NULL pointers into allocation tracking system
	while ( m_pFreeBlock )
	{
	LocalMemoryAllocation_t *p = m_pFreeBlock;
	m_pFreeBlock = p->m_pNext;
	m_arrAllocations[ p->m_block.MutableIndex() ] = NULL;
	delete p;
	}
	Assert( !m_pFreeBlock && !m_pPendingFreeBlock );
	// These are elements requiring reallocation
	uint32 uiCount = m_arrAllocations.Count();
	CPs3gcmLocalMemoryBlockMutable **pReallocationBlocks = m_arrAllocations.Base();
	// Here "correct" implementation would be to copy off m_arrAllocations vector onto stack for iteration,
	// RemoveAll from m_arrAllocations vector and allocate all blocks again.
	// We will cheat since we know that we will allocate same number of elements and directly write zero
	// into m_arrAllocations m_Size member, then we will still be able to use the memory of the vector
	// for reading blocks requiring compact reallocation, and AddToTail will still fill the vector with
	// correct data.
	struct AllocatorCompactVectorCheat : public CUtlVector< CPs3gcmLocalMemoryBlockMutable * > { inline void ResetCountPreservingMemoryContents() { m_Size = 0; } };
	( ( AllocatorCompactVectorCheat * ) ( char * ) &m_arrAllocations )->ResetCountPreservingMemoryContents();
	m_nOffsetUnallocated = m_nOffsetMin;
	// Prepare RSX for data buffer transfers in local memory
	uint nTransferMode = ( ( this - &g_ps3gcmLocalMemoryAllocator[ kGcmAllocPoolDefault ] ) < kGcmAllocPoolMainMemory ) ? CELL_GCM_TRANSFER_LOCAL_TO_LOCAL : CELL_GCM_TRANSFER_MAIN_TO_MAIN;
	Assert( nTransferMode < 4 );
	// Reallocate all blocks
	for ( ; uiCount; -- uiCount, ++ pReallocationBlocks )
	{
	CPs3gcmLocalMemoryBlockMutable *pBlock = *pReallocationBlocks;
	if ( !pBlock )
	continue;	// slot belonged to a freed block
	uint32 nOldOffset = pBlock->Offset();
	char* pOldAddress = pBlock->DataInAnyMemory();
	TrackAllocStats( pBlock->MutableType(), - pBlock->MutableSize() );
	// Re-allocate: with the pool reset this packs the block at the lowest fitting offset
	Alloc( pBlock );
	if ( nOldOffset == pBlock->Offset() )
	continue;	// block didn't move; no copy needed
	// Have RSX transfer blocks data. RSX may hang if there's WriteLabel between the Format and Offset commands,
	// so reserve space for both of them up front
	// SpuDrawTransfer_t * pTransfer = g_spuGcm.GetDrawQueue()->AllocWithHeader<SpuDrawTransfer_t>( SPUDRAWQUEUE_TRANSFER_METHOD | nTransferMode );
	// pTransfer->m_nLineSize = pBlock->MutableSize();
	// pTransfer->m_nOldOffset = nOldOffset;
	// pTransfer->m_nNewOffset = pBlock->Offset();
	// 7LTODO
	uint32 uiLineSize = pBlock->MutableSize();
	uint32 uiLineOffset = 0;
	const uint nMaxTransferSize = 0x3FFFFF;	// hardware limit per transfer line
	cellGcmReserveMethodSizeInline(gpGcmContext, 0x4000/4);
	GCM_FUNC( cellGcmSetTransferDataMode, nTransferMode );
	int i = 1;
	do
	{
	// Split the copy into chunks of at most nMaxTransferSize bytes
	uint32 uiTransferSize = Min<uint32>( uiLineSize, nMaxTransferSize );
	GCM_FUNC( cellGcmSetTransferDataFormat, 0, 0, uiTransferSize, 1, 1, 1 );
	GCM_FUNC( cellGcmSetTransferDataOffset, pBlock->Offset() + uiLineOffset, nOldOffset + uiLineOffset );
	uiLineSize -= uiTransferSize;
	uiLineOffset += uiTransferSize;
	i++;
	}
	while ( uiLineSize > 0 );
	// V_memmove(pBlock->DataInAnyMemory(), pOldAddress, pBlock->MutableSize() );
	}
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	ValidateAllBlocks();
#endif
}
//////////////////////////////////////////////////////////////////////////
//
// Computation of tiled memory
//
uint32 CPs3gcmLocalMemoryBlock::TiledMemoryTagAreaBase() const
{
	// The tag area is addressed in 64Kb (0x10000) granules. Color tiles are
	// laid out relative to the first preset tiled pool (the frame-buffer
	// color pool); depth tiles hang off the end of the 0..0x7FF tag range.
	uint32 nColorTagOrigin = g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFB].m_nOffsetMin;
	CPs3gcmAllocationPool_t ePool = PS3GCMALLOCATIONPOOL( m_uType );
	switch ( ePool )
	{
	case kGcmAllocPoolTiledColorMisc:
		// Misc color tiles are placed at the front of tag area after preset pools
		return ( Offset() - nColorTagOrigin ) / 0x10000;
	case kGcmAllocPoolTiledD24S8:
		// Depth tiles are placed in the end of tag area (0-0x7FF is offset range)
		return 0x800 - ( Offset() - g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledD24S8].m_nOffsetMin + m_uiSize ) / 0x10000;
	case kGcmAllocPoolTiledColorFB:
		// FB color tiles go first (the FB pool base IS the tag origin, so this is always 0)
		return ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFB].m_nOffsetMin - nColorTagOrigin ) / 0x10000;
	case kGcmAllocPoolTiledColorFBQ:
		// FBQ color tiles go next
		return ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColorFBQ].m_nOffsetMin - nColorTagOrigin ) / 0x10000;
	case kGcmAllocPoolTiledColor512:
		// 512 color tiles go next
		return ( g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledColor512].m_nOffsetMin - nColorTagOrigin ) / 0x10000;
	default:
		break;
	}
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	Error( "<vitaliy> Cannot compute tiled memory tag base from a non-tiled-pool allocation!\n" );
#endif
	return ~0;
}
uint32 CPs3gcmLocalMemoryBlock::TiledMemoryIndex() const
{
	// Maps the allocation's pool to a hardware tile slot index.
	CPs3gcmAllocationPool_t ePool = PS3GCMALLOCATIONPOOL( m_uType );
	switch ( ePool )
	{
	case kGcmAllocPoolTiledColorMisc:
		// Misc color tiles follow the preset tiled color pools
		return m_uiIndex + kGcmAllocPoolTiledColorMisc - kGcmAllocPoolTiledColorFB;
	case kGcmAllocPoolTiledD24S8:
		// Depth tiles occupy the last tile slots, growing down from 14
		return 14 - m_uiIndex;
	default:
		// Preset color pools map 1:1 onto the first tile slots
		return ePool - kGcmAllocPoolTiledColorFB;
	}
}
uint32 CPs3gcmLocalMemoryBlock::ZcullMemoryIndex() const
{
	// Only depth (D24S8) tiles are backed by zcull; their tracking index is
	// also their zcull index.
	if ( PS3GCMALLOCATIONPOOL( m_uType ) == kGcmAllocPoolTiledD24S8 )
		return m_uiIndex;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	Error( "<vitaliy> Cannot compute zcull index from a non-zcull allocation!\n" );
#endif
	return ~0;
}
uint32 CPs3gcmLocalMemoryBlock::ZcullMemoryStart() const
{
	// Zcull start is measured in pixels (1 byte per pixel of zcull storage).
	// D24S8 is 4 bytes per pixel, hence the divide; the result is implicitly
	// 4096-aligned because the block offset is 64Kb-aligned.
	if ( PS3GCMALLOCATIONPOOL( m_uType ) == kGcmAllocPoolTiledD24S8 )
		return ( Offset() - g_ps3gcmLocalMemoryAllocator[kGcmAllocPoolTiledD24S8].m_nOffsetMin ) / 4;
#ifdef GCMLOCALMEMORYBLOCKDEBUG
	Error( "<vitaliy> Cannot compute zcull memory start from a non-zcull allocation!\n" );
#endif
	return ~0;
}
//////////////////////////////////////////////////////////////////////////
//
// Allow shaderapi to query GPU memory stats:
//
// Copies the current RSX memory usage counters into the caller-provided
// struct; this is the query entry point used by shaderapi.
void GetGPUMemoryStats( GPUMemoryStats &stats )
{
	stats = g_RsxMemoryStats;
}

View File

@@ -0,0 +1,167 @@
//================ Copyright (c) Valve Corporation. All Rights Reserved. ===========================
//
// Local memory manager
//
//==================================================================================================
#ifndef INCLUDED_PS3GCMMEMORY_H
#define INCLUDED_PS3GCMMEMORY_H
#ifndef SPU
#include "tier1/strtools.h"
#include "shaderapi/gpumemorystats.h"
#include "cell/gcm.h"
#include "gcmconfig.h"
#else
#endif
//--------------------------------------------------------------------------------------------------
// Externals
//--------------------------------------------------------------------------------------------------
#ifndef SPU
extern void GetGPUMemoryStats( GPUMemoryStats &stats );
extern void Ps3gcmLocalMemoryAllocator_Init();
#endif
//--------------------------------------------------------------------------------------------------
// Memory Pools, Types and LocalMemoryBlock
//--------------------------------------------------------------------------------------------------
// Memory pools for GPU allocations. NOTE: the ordering is significant —
// the pool index is packed into the top 4 bits of an allocation type (see
// PS3GCMALLOCATIONPOOL), pools before kGcmAllocPoolMainMemory live in RSX
// local memory, and pools before kGcmAllocPoolMallocMemory are RSX-mapped
// (see CPs3gcmLocalMemoryBlock::IsLocalMemory / IsRsxMappedMemory).
enum CPs3gcmAllocationPool_t
{
	kGcmAllocPoolDefault,
	kGcmAllocPoolDynamicNewPath,
	kGcmAllocPoolDynamic,
	kGcmAllocPoolTiledColorFB, // Frame-buffer tiled color memory (should be first preset tiled region)
	kGcmAllocPoolTiledColorFBQ, // Quarter-frame-buffer tiled color memory
	kGcmAllocPoolTiledColor512, // 512x512 tiled color memory
	kGcmAllocPoolTiledColorMisc, // Last tiled color region
	kGcmAllocPoolTiledD24S8, // Tiled depth/stencil region
	kGcmAllocPoolMainMemory, // Pool in the main RSX-mapped IO memory
	kGcmAllocPoolMallocMemory, // Pool in malloc-backed non-RSX-mapped memory
	kGcmAllocPoolCount
};
// An allocation type packs three fields into a uint32:
//   bits  0..23 : required alignment in bytes
//   bits 24..27 : type id within the pool
//   bits 28..31 : CPs3gcmAllocationPool_t pool index
#define PS3GCMALLOCATIONPOOL( uType ) ( (CPs3gcmAllocationPool_t)( ( ((uint32)(uType)) >> 28 ) & 0xF ) )
#define PS3GCMALLOCATIONALIGN( uType ) ( ((uint32)(uType)) & 0xFFFFFF )
// NOTE: the whole expansion is parenthesized (the original was not) so the
// macro composes safely with any surrounding operator at the call site.
#define PS3GCMALLOCATIONTYPE( uAlign, ePool, iType ) ( (((uint32)(uAlign))&0xFFFFFF) | ( (((uint32)(iType))&0xF) << 24 ) | ( (((uint32)(ePool))&0xF) << 28 ) )
// Concrete allocation types: each constant packs (alignment, pool, type id)
// via PS3GCMALLOCATIONTYPE; the pool determines where the data lives.
enum CPs3gcmAllocationType_t
{
	// Default pool
	kAllocPs3gcmTextureData0 = PS3GCMALLOCATIONTYPE( 128, kGcmAllocPoolMainMemory, 0 ),
	kAllocPs3gcmTextureData = PS3GCMALLOCATIONTYPE( 128, kGcmAllocPoolDefault, 1 ),
	kAllocPs3GcmVertexBuffer = PS3GCMALLOCATIONTYPE( 32, kGcmAllocPoolDefault, 2 ),
	kAllocPs3GcmIndexBuffer = PS3GCMALLOCATIONTYPE( 32, kGcmAllocPoolDefault, 3 ),
	kAllocPs3GcmShader = PS3GCMALLOCATIONTYPE( 128, kGcmAllocPoolDefault, 4 ),
	kAllocPs3GcmEdgeGeomBuffer = PS3GCMALLOCATIONTYPE( 128, kGcmAllocPoolDefault, 5 ),
	// Dynamic pool
	kAllocPs3GcmVertexBufferDynamic = PS3GCMALLOCATIONTYPE( 32, kGcmAllocPoolDynamic, 1 ),
	kAllocPs3GcmIndexBufferDynamic = PS3GCMALLOCATIONTYPE( 32, kGcmAllocPoolDynamic, 2 ),
	kAllocPs3GcmDynamicBufferPool = PS3GCMALLOCATIONTYPE( 32, kGcmAllocPoolDynamicNewPath, 1 ),
	// Malloc memory pool
	kAllocPs3GcmVertexBufferDma = PS3GCMALLOCATIONTYPE( 32, kGcmAllocPoolMallocMemory, 1 ),
	kAllocPs3GcmIndexBufferDma = PS3GCMALLOCATIONTYPE( 32, kGcmAllocPoolMallocMemory, 2 ),
	// Tiled pools (color/depth render targets; 64Kb alignment for tile regions)
	kAllocPs3gcmColorBufferFB = PS3GCMALLOCATIONTYPE( 64, kGcmAllocPoolTiledColorFB, 1 ),
	kAllocPs3gcmColorBufferFBQ = PS3GCMALLOCATIONTYPE( 64, kGcmAllocPoolTiledColorFBQ, 1 ),
	kAllocPs3gcmColorBuffer512 = PS3GCMALLOCATIONTYPE( 64, kGcmAllocPoolTiledColor512, 1 ),
	kAllocPs3gcmColorBufferMisc = PS3GCMALLOCATIONTYPE( 64*1024, kGcmAllocPoolTiledColorMisc,1 ),
	kAllocPs3gcmDepthBuffer = PS3GCMALLOCATIONTYPE( 64*1024, kGcmAllocPoolTiledD24S8, 1 ),
};
struct CPs3gcmLocalMemoryBlockSystemGlobal;
// Handle to a single allocation in GPU-visible memory (RSX local memory,
// RSX-mapped IO memory, or malloc-backed main memory). Stores the RSX
// offset rather than a pointer; use the DataIn*Memory() accessors to get an
// effective address. Copying is restricted because the compaction/patch-back
// mechanism must be able to track every live block.
struct ALIGN16 CPs3gcmLocalMemoryBlock
{
public:
	CPs3gcmLocalMemoryBlock() {}
#if 0
#define GCMLOCALMEMORYBLOCKDEBUG
	uint64 m_dbgGuardCookie; // Debug cookie used to guard when calling code let block go out of scope without freeing it
#endif
protected:
	uint32 m_nLocalMemoryOffset; // Offset in RSX local memory
	uint32 m_uiSize; // Actual allocation size, might be larger than requested allocation size
	CPs3gcmAllocationType_t m_uType; // Allocation type with required alignment
	uint32 m_uiIndex; // Index of the allocation in allocation tracking system
	bool Alloc(); // Internal implementation of Local Memory Allocator
	// Prevent copying (since patch-back mechanism needs to access the allocated blocks)
	CPs3gcmLocalMemoryBlock( CPs3gcmLocalMemoryBlock const &x ) { V_memcpy( this, &x, sizeof( CPs3gcmLocalMemoryBlock ) ); }
	CPs3gcmLocalMemoryBlock& operator =( CPs3gcmLocalMemoryBlock const &x ) { V_memcpy( this, &x, sizeof( CPs3gcmLocalMemoryBlock ) ); return *this; }
public:
	// Bitwise copy from the system-global variant (normal copying is private)
	inline void Assign( CPs3gcmLocalMemoryBlockSystemGlobal const &x ) { V_memcpy( this, &x, sizeof( CPs3gcmLocalMemoryBlock ) ); }
	inline bool Alloc( CPs3gcmAllocationType_t uType, uint32 uiSize ) { m_uType = uType; m_uiSize = uiSize; return Alloc(); }
	// Wraps memory not owned by the allocator; m_uiIndex = ~0 marks the block as untracked
	inline void AttachToExternalMemory( CPs3gcmAllocationType_t uType, uint32 nOffset, uint32 uiSize ) { m_uType = uType; m_uiSize = uiSize; m_nLocalMemoryOffset = nOffset; m_uiIndex = ~0; }
	void Free();
	void FreeAndAllocNew() { Free(); Alloc(); }
	inline uint32 Offset() const { return m_nLocalMemoryOffset; }
	inline uint32 Size() const { return m_uiSize; }
	// Pool ordering encodes the memory kind (see CPs3gcmAllocationPool_t)
	inline bool IsLocalMemory() const { return PS3GCMALLOCATIONPOOL( m_uType ) < kGcmAllocPoolMainMemory; }
	inline bool IsRsxMappedMemory() const { return PS3GCMALLOCATIONPOOL( m_uType ) < kGcmAllocPoolMallocMemory; }
	inline uint8 GcmMemoryLocation() const { return IsLocalMemory() ? CELL_GCM_LOCATION_LOCAL : CELL_GCM_LOCATION_MAIN; }
#ifndef SPU
	char * DataInLocalMemory() const;
	char * DataInMainMemory() const;
	char * DataInMallocMemory() const;
	char * DataInAnyMemory() const;
#endif
	// Tiled memory access
	uint32 TiledMemoryTagAreaBase() const;
	uint32 TiledMemoryIndex() const;
	// Zcull memory access
	uint32 ZcullMemoryIndex() const;
	uint32 ZcullMemoryStart() const;
} ALIGN16_POST;
// System-global flavor of a local memory block: identical layout, but
// copying is fully disabled (declared, never defined), since the patch-back
// mechanism must be able to reach every live block by address.
struct CPs3gcmLocalMemoryBlockSystemGlobal : public CPs3gcmLocalMemoryBlock
{
public:
	CPs3gcmLocalMemoryBlockSystemGlobal() {}
private:
	// Prevent copying (since patch-back mechanism needs to access the allocated blocks)
	CPs3gcmLocalMemoryBlockSystemGlobal( CPs3gcmLocalMemoryBlock const &x );
	CPs3gcmLocalMemoryBlockSystemGlobal& operator =( CPs3gcmLocalMemoryBlockSystemGlobal const &x );
};
//--------------------------------------------------------------------------------------------------
// Buffer (used by IB and VBs)
//--------------------------------------------------------------------------------------------------
// GPU buffer handle used by index and vertex buffers: a thin wrapper over a
// local memory block, created/destroyed through New()/Release() on PPU.
struct CPs3gcmBuffer
{
	CPs3gcmLocalMemoryBlock m_lmBlock;
public:
	inline uint32 Offset() { return m_lmBlock.Offset(); }
public:
#ifndef SPU
	static CPs3gcmBuffer * New( uint32 uiSize, CPs3gcmAllocationType_t uType );
	void Release();
#endif
};
#endif // INCLUDED_PS3GCMMEMORY_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,205 @@
//========== Copyright (c) 2010, Valve Corporation, All rights reserved. ========
// Global GCM-related state
//
#ifndef _PS3GCMSTATE_H_INC_
#define _PS3GCMSTATE_H_INC_
#include "ps3/ps3gcmmemory.h"
#include <cell/gcm.h>
#include "bitmap/imageformat.h"
#include "ps3/ps3_gcm_shared.h"
// Global GCM state: owns the RSX IO/local memory mappings, command-buffer
// bookkeeping, render-target dimensions and the display surface chain.
class CPs3gcmGlobalState
{
public:
	void * m_pIoAddress; // RSX IO buffer, base address
	uint32 m_nIoSize; // RSX IO total size [including CMD buffer]
	uint32 m_nIoSizeNotPreallocated; // the io total size that wasn't pre-allocated in initialization
	uint32 m_nCmdSize; // RSX CMD buffer total size [including first reserved 4K]
	uint32 const volatile *m_pCurrentCmdBufferSegmentRSX; // Begin offset of current CMD buffer segment being processed by RSX
#if GCM_CTX_UNSAFE_MODE
	uint32 *m_pCurrentCmdBufferUnflushedBeginRSX; // Marks beginning of not yet flushed RSX buffer
#endif
	void * m_pLocalBaseAddress; // RSX Local Memory Base Address
	uint32 m_nLocalBaseOffset; // cellGcmAddressToOffset( m_pLocalBaseAddress )
	uint32 m_nLocalSize; // RSX Local Memory Size
	uint16 m_nRenderSize[2]; // width & height of the render buffer
	float m_flRenderAspect; // aspect ratio of the output device
	uint32 m_nIoOffsetDelta; // add this to EA to get Io Offset
	uint32 m_nSurfaceRenderPitch; // pitch in bytes of the render surfaces
	// this is used to allocate permanent cmd buffers; to be cleared when level reloads, hopefully won't need anything more complicated than that
	// but if we do, we can make a page-chain-based (page from 128 bytes) allocator with reference count per page
	// NOTE: the buffer MUST have 1KB padding in the end to prevent overfetch RSX crash!
	CellGcmContextData m_cmdBufferPermContext;
	// vertex and index data buffer
	void * m_pRsxDataTransferBuffer;
	uint32 m_nRsxDataTransferBufferSize;
	// main memory pool buffer
	void * m_pRsxMainMemoryPoolBuffer;
	uint32 m_nRsxMainMemoryPoolBufferSize;
	// special texture to support debug stripes
	CPs3gcmLocalMemoryBlock m_debugStripeImageBuffer;
	uint32 m_nCmdBufferRefCount; // how many buffers are referenced?
	CPs3gcmDisplay m_display; // m_display objects that are created automatically
	CPs3gcmLocalMemoryBlock m_pShaderPsEmptyBuffer;
	CgBinaryProgram *m_pShaderPsEmpty; // empty pixel shader
	uint32 m_nIoLocalOffsetEmptyFragmentProgramSetupRoutine;
	uint32 m_nFlushCounter;
	float m_flAllocatorStallTimeWaitingRSX; // how long allocator ended up waiting for RSX
public:
	int32 Init();
	void Shutdown();
	void DrawDebugStripe( uint nScreenX, uint nScreenY, uint nStripeY, uint nStripeWidth, uint nStripeHeight, int nNext = 0 );
	// pre-allocate memory before command buffer is allocated
	void * IoMemoryPrealloc( uint nAlign, uint nSize );
	void * IoSlackAlloc( uint nAlign, uint nSize );
	void IoSlackFree( void * eaMemory );
	bool IsIoMemory( void * eaMemory );
	uintp CmdBufferToIoOffset( void *pCmdBuffer );
	// Reference-counted access to the shared permanent command context
	CellGcmContextData* CmdBufferAlloc( );
	void CmdBufferFreeOffset( uint32 );
	enum CmdBufferFlushType_t
	{
		kFlushForcefully,
		kFlushEndFrame
	};
	void CmdBufferFlush( CmdBufferFlushType_t eFlushType );
	void CmdBufferFinish();
	void CmdBufferReservationCallback( struct CellGcmContextData *context );
	uint32 GetRsxControlNextReferenceValue();
	// Note:
	// Height alignment must be 32 for tiled surfaces on RSX
	// 128 for Edge Post MLAA
	// 64 for Edge Post MLAA with EDGE_POST_MLAA_MODE_TRANSPOSE_64 flag set
	uint GetRenderSurfaceBytes( uint nHeightAlignment = 32 ) const { return m_nSurfaceRenderPitch * AlignValue( m_nRenderSize[1], nHeightAlignment ); }
protected:
	void CreateDebugStripeTextureBuffer();
	void CreateEmptyPixelShader();
	void CreateRsxBuffers();
	void CreateIoBuffers();
	int InitVideo();
	int InitGcm();
};
// Converts a command-buffer EA into an RSX IO offset, asserting the pointer
// lies either in the permanent context or in the main (SYSring) CMD buffer.
inline uintp CPs3gcmGlobalState::CmdBufferToIoOffset( void *pCmdBuffer )
{
	uintp eaCmd = uintp( pCmdBuffer );
	uintp nIoOffset = eaCmd + m_nIoOffsetDelta;
	bool bInPermContext = eaCmd >= uintp( m_cmdBufferPermContext.begin ) && eaCmd < uintp( m_cmdBufferPermContext.end );
	bool bInMainCmdBuffer = nIoOffset >= 4096 && nIoOffset <= m_nCmdSize;
	Assert( bInPermContext || bInMainCmdBuffer );
	return nIoOffset;
}
// Hands out the shared permanent command context, tracking outstanding
// references so CmdBufferFreeOffset() knows when it may be rewound.
inline CellGcmContextData* CPs3gcmGlobalState::CmdBufferAlloc( )
{
	++m_nCmdBufferRefCount;
	return &m_cmdBufferPermContext;
}
// Drops one reference; when the last one goes away the permanent context is
// rewound so its space can be reused.
inline void CPs3gcmGlobalState::CmdBufferFreeOffset( uint32 )
{
	--m_nCmdBufferRefCount;
	if ( m_nCmdBufferRefCount == 0 )
	{
		m_cmdBufferPermContext.current = m_cmdBufferPermContext.begin;
	}
}
extern CPs3gcmGlobalState g_ps3gcmGlobalState;
//////////////////////////////////////////////////////////////////////////
//
// inline implementations of PPU-only stuff
//
// EA of the block in RSX local memory:
// base address + (block offset relative to the base offset).
inline char * CPs3gcmLocalMemoryBlock::DataInLocalMemory() const
{
	Assert( IsLocalMemory() );
	char *pLocalBase = ( char * ) g_ps3gcmGlobalState.m_pLocalBaseAddress;
	return pLocalBase + ( m_nLocalMemoryOffset - g_ps3gcmGlobalState.m_nLocalBaseOffset );
}
// EA of the block in RSX-mapped IO (main) memory: the stored offset is
// relative to the IO base address.
inline char * CPs3gcmLocalMemoryBlock::DataInMainMemory() const
{
	Assert( !IsLocalMemory() && IsRsxMappedMemory() );
	char *pIoBase = ( char * ) g_ps3gcmGlobalState.m_pIoAddress;
	return pIoBase + m_nLocalMemoryOffset;
}
// Malloc-backed blocks store the effective address directly in the offset
// field, so the "offset" is simply reinterpreted as a pointer.
inline char * CPs3gcmLocalMemoryBlock::DataInMallocMemory() const
{
	Assert( !IsLocalMemory() && !IsRsxMappedMemory() );
	return ( char * ) m_nLocalMemoryOffset;
}
inline char * CPs3gcmLocalMemoryBlock::DataInAnyMemory() const
{
switch ( PS3GCMALLOCATIONPOOL( m_uType ) )
{
default: return DataInLocalMemory();
case kGcmAllocPoolMainMemory: return DataInMainMemory();
case kGcmAllocPoolMallocMemory: return DataInMallocMemory();
}
}
// Allow shaderapi to query GPU memory stats:
extern void GetGPUMemoryStats( GPUMemoryStats &stats );
class CmdSubBuffer: public CellGcmContextData
{
public:
static int32_t DoNothing( struct CellGcmContextData *pContext, uint32_t nWords )
{
Error( "CmdSubBuffer callback @%p: trying to allocate %u words\n", pContext, nWords );
return CELL_ERROR_ERROR_FLAG;
}
CmdSubBuffer( uint32 * pBuffer, uint nAllocateWords )
{
this->current = this->begin = pBuffer;
this->end = this->begin + nAllocateWords;
this->callback = DoNothing;
}
~CmdSubBuffer()
{
Assert( this->current == this->end );
}
};
extern uint32 CalculateMemorySizeFromCmdLineParam( char const *pCmdParamName, uint32 nDefaultValue, uint32 nMinValue = 0 );
// Returns true when eaMemory points into the RSX-mapped IO range.
// NOTE(review): the upper bound uses <=, i.e. the one-past-end address is
// treated as inside the range — confirm this inclusiveness is intentional
// (callers passing end pointers would rely on it).
inline bool CPs3gcmGlobalState::IsIoMemory( void * eaMemory )
{
	return uintp( eaMemory ) >= uintp( m_pIoAddress ) && uintp( eaMemory ) <= uintp( m_pIoAddress ) + m_nIoSize;
}
#endif // _PS3GCMSTATE_H_INC_

View File

@@ -0,0 +1,650 @@
// Copyright (c) 2010, Valve Corporation, All rights reserved. ========
#include "tier0/platform.h"
#include "tier0/dbg.h"
#include "tier1/convar.h"
#include "ps3/ps3gcmlabels.h"
#include "ps3gcmstate.h"
#include "spugcm.h"
#include "rsxflip.h"
// Global flip handler instance and its tuning convars.
CFlipHandler g_flipHandler;
// Debug: percentage chance of dropping a queued RSX user command (stress test)
ConVar r_drop_user_commands( "r_drop_user_commands", "0" );
// MLAA mode bits requested for the next frame (masked by EDGE_POST_MLAA_FLAG_MASK)
ConVar r_ps3_mlaa( "r_ps3_mlaa", "1" ); //
ConVar r_ps3_vblank_miss_threshold( "r_ps3_vblank_miss_threshold", "0.08", FCVAR_DEVELOPMENTONLY, "How much % of vsync time is allowed after vblank for frames that missed vsync to tear and flip immediately" );
#if GCM_ALLOW_TIMESTAMPS
// Slot index for the next frame-begin timestamp; set to -1 once written (see BeginScene)
int32 g_ps3_timestampBeginIdx = GCM_REPORT_TIMESTAMP_FRAME_FIRST;
#endif
// Debug-only flip/interrupt event log (enable by switching #if 0 to #if 1):
// a small lock-free ring of fixed-size messages, each stamped with the PPU
// timebase in its last 4 bytes. Compiled out to no-ops in normal builds.
#if 0 // defined(_DEBUG)
char ALIGN16 g_flipLog[256][32] ALIGN16_POST;
uint g_flipLogIdx = 0;
// Atomically claims a slot, formats the message, zero-pads, stamps timebase
#define FLIP_LOG(MSG,...) \
{ \
uint nLogIdx = cellAtomicIncr32( &g_flipLogIdx ) & ( ARRAYSIZE( g_flipLog ) - 1 ); \
int nCount = V_snprintf( g_flipLog[nLogIdx], sizeof( g_flipLog[nLogIdx] ), MSG, ##__VA_ARGS__ ); \
int zeroSize = sizeof( g_flipLog[0] ) - 4 - nCount; \
V_memset( g_flipLog[nLogIdx] + nCount, 0, zeroSize ); \
*(uint32*)( g_flipLog[nLogIdx] + sizeof( g_flipLog[0] ) - 4 ) = __mftb(); \
}
#define ENABLE_FLIP_LOG 1
#define FlipAssert( X ) do{if(!(X))DebuggerBreak();}while(false)
uint g_flipUserCommands[1024][2];
#else
#define FLIP_LOG(MSG,...)
#define FlipAssert( X )
#define ENABLE_FLIP_LOG 0
#endif
// Kicks the Edge Post MLAA SPU workload on the frame image previously
// transferred to main memory; the result goes to dst, or to the intermediate
// main-memory output buffer when IsResultInMainMemory(). nSetLabel is the
// value the SPU writes to the RSX label when the tasks complete.
void CEdgePostWorkload::Kick( void * dst, uint nSetLabel )
{
	if( !m_isInitialized )
		return;
	extern ConVar r_ps3_mlaa;
	FLIP_LOG("mlaa %d,mode=%Xh,label=%d", nSetLabel, g_flipHandler.m_nMlaaFlagsThisFrame, *m_mlaaContext.rsxLabelAddress );
	// wait for any previous MLAA task to finish before reusing the context
	edgePostMlaaWait( &m_mlaaContext );
	FlipAssert( vec_all_eq( *g_spuGcm.m_pMlaaBufferCookie, g_vuSpuGcmCookie ) );
	//FLIP_LOG("mlaa init %d", nSetLabel );
	edgePostInitializeWorkload( &m_workload, m_stages, STAGE_COUNT );
	bool isMlaaRelativeEdgeDetection = true;
	uint8
	nMlaaThresholdBase (0x0a), // from Edge sample: these are pretty good threshold values, but you might find better ones...
	nMlaaThresholdFactor(0x59),
	nMlaaAbsoluteThreshold(0x20);
	uint nWidth = g_ps3gcmGlobalState.m_nRenderSize[0], nHeight = g_ps3gcmGlobalState.m_nRenderSize[1];
	FlipAssert( nWidth <= 1280 && nWidth >= 640 && nHeight <= 720 && nHeight >= 480 );
	//FLIP_LOG("mlaa prep %d", nSetLabel );
	edgePostMlaaPrepareWithRelativeThreshold( &m_mlaaContext, g_spuGcm.m_pMlaaBuffer, IsResultInMainMemory()? g_spuGcm.m_pMlaaBufferOut : dst,
		nWidth, nHeight,
		g_ps3gcmGlobalState.m_nSurfaceRenderPitch,
		isMlaaRelativeEdgeDetection?nMlaaThresholdBase:nMlaaAbsoluteThreshold,
		isMlaaRelativeEdgeDetection?nMlaaThresholdFactor:0,
		g_flipHandler.m_nMlaaFlagsThisFrame,
		nSetLabel );
	//FLIP_LOG("mlaa kick %d", nSetLabel );
	edgePostMlaaKickTasks( &m_mlaaContext );
	FLIP_LOG("mlaa kicked %d,label=%d", nSetLabel, *m_mlaaContext.rsxLabelAddress );
	FlipAssert( vec_all_eq( *g_spuGcm.m_pMlaaBufferCookie, g_vuSpuGcmCookie ) );
}
// Resets the ring to empty: producer and consumer cursors coincide.
void RsxInterruptFifo::Init()
{
	m_nGet = 0;
	m_nPut = 0;
}
// Convenience overload: packs cause + surface index into an event record
// and enqueues it. Returns the new put marker.
uint RsxInterruptFifo::Queue( uint8 nCause, uint8 nSurfaceFlipIdx )
{
	Event_t ev;
	ev.m_nCause = nCause;
	ev.m_nSurfaceFlipIdx = nSurfaceFlipIdx;
	return Queue( ev );
}
// Enqueues an event, spinning while the ring is full (which should never
// actually happen in practice). Returns the new put marker.
uint RsxInterruptFifo::Queue( const Event_t &event )
{
	while( ( m_nPut - m_nGet ) >= MAX_EVENT_COUNT - 1 )
	{
		sys_timer_usleep( 100 ); // this should NEVER happen
	}
#if ENABLE_FLIP_LOG
	switch( event.m_nCause )
	{
	case GCM_USERCMD_POSTPROCESS:
		FLIP_LOG( "queue:post %d", event.m_nSurfaceFlipIdx );
		break;
	case GCM_USERCMD_FLIPREADY:
		FLIP_LOG( "queue:flip %d sys%d", event.m_nSurfaceFlipIdx, g_flipHandler.m_nSystemFlipId[ event.m_nSurfaceFlipIdx ] );
		break;
	default:
		FLIP_LOG("Unknown event %d", event.m_nCause );
		break;
	}
#endif
	// Store first, then publish by advancing the put cursor
	m_queue[ m_nPut & ( MAX_EVENT_COUNT - 1 ) ] = event;
	return ++m_nPut; // Should be atomic if there are multiple event producer threads
}
// Snapshot of the producer cursor; pass to HasEvents() to know how many
// events were queued up to this point.
uint RsxInterruptFifo::GetPutMarker()const
{
	return m_nPut;
}
// Number of events still pending up to the given put marker; the unsigned
// subtraction keeps the count correct across counter wrap-around.
int RsxInterruptFifo::HasEvents( uint nMarker )
{
	uint nGet = m_nGet;
	int nPending = int( nMarker - nGet );
	Assert( nPending >= 0 );
	return nPending;
}
// Reference to the head event without consuming it. The caller must ensure
// the queue is non-empty.
RsxInterruptFifo::Event_t & RsxInterruptFifo::PeekEvent()
{
	uint nHead = m_nGet;
	Assert( nHead != m_nPut );
	return m_queue[ nHead & ( MAX_EVENT_COUNT - 1 ) ];
}
// Copies the head event out and advances the consumer cursor
// (the increment would need to be atomic with multiple consumers).
const RsxInterruptFifo::Event_t RsxInterruptFifo::DequeueEvent( )
{
	Event_t ev = PeekEvent();
	++m_nGet;
	return ev;
}
// Tells the RSX interrupt thread that all events up to the current put
// marker are ready: the label write request is pushed straight into the
// SPUGCM draw queue (the older GCM_FUNC-based path is kept commented out).
void RsxInterruptFifo::QueueRsxInterrupt()
{
	uint32 *pReplace = NULL;
#if ENABLE_FLIP_LOG
	//FLIP_LOG( "q%X", m_nPut );
	g_flipUserCommands[ m_nPut & ( ARRAYSIZE( g_flipUserCommands ) - 1 ) ][ 0 ] = m_nPut;
	pReplace = &g_flipUserCommands[ m_nPut & ( ARRAYSIZE( g_flipUserCommands ) - 1 ) ][ 1 ];
	*pReplace = uint32( gCellGcmCurrentContext->current );
#endif
	/*
	if( IsCert() // don't deliberately drop anything in CERT
	|| 0 == r_drop_user_commands.GetInt() // don't drop anything if drop==0
	|| ( ( rand() % 100 ) >= r_drop_user_commands.GetInt() ) // drop 1% means in 99% of cases we still want to SetUserCommand
	)
	GCM_FUNC( cellGcmSetUserCommand, m_nPut );
	GCM_FUNC( cellGcmSetWriteTextureLabel, GCM_LABEL_LAST_INTERRUPT_GET, m_nPut );
	*/
	// directly putting it to SPUGCM queue instead of routing it through GCM_FUNC
	g_spuGcm.GetDrawQueue()->Push3( SPUDRAWQUEUE_QUEUE_RSX_INTERRUPT_METHOD | GCM_LABEL_LAST_INTERRUPT_GET, m_nPut, ( uintp )pReplace );
}
// One-time setup: resets the event FIFO and flip bookkeeping, seeds the
// flip-ready events with a simulated initial flip history, clears the
// "last interrupt get" label, and installs the vblank + user handlers.
void CFlipHandler::Init()
{
	m_interruptFifo.Init();
	/*
	V_memset( m_nDebugStates, 0, sizeof( m_nDebugStates ) );
	m_nDebugStates[RENDERING_SURFACE] = -1;
	*/
	m_nFlipSurfaceIdx = 0;
	m_nFlipSurfaceCount = 0;
	m_nVblankCounter = 100; // how many vblanks since the last flip?
	m_bEdgePostResultAlreadyInLocalMemory = false;
	m_nMlaaFlagsThisFrame = 0; // disable MLAA before the first BeginScene() is called
	m_nMlaaFlagMaskNextFrame = ~0u;
	for( int i = 0; i < ARRAYSIZE( m_surfaceEdgePost ) ; ++i ) // initially, the post processing of surfaces is disabled
		m_surfaceEdgePost[i] = 0;
	// simulated initial state: we just flipped to surface 1, then 2, thus leaving surface 1 (then 0) available to render into
	// event[1] may not be set for MLAA mode because in order to start rendering into surface 0 (which we're rendering into), we "waited" for event 1
	for ( int j = 2; j < ARRAYSIZE( m_evFlipReady ); ++ j )
		m_evFlipReady[j].Set();
	//m_nLastFlippedSurfaceIdx = CPs3gcmDisplay::SURFACE_COUNT - 1 ;
	m_pLastInterruptGet = cellGcmGetLabelAddress( GCM_LABEL_LAST_INTERRUPT_GET );
	*m_pLastInterruptGet = 0;
	cellGcmSetVBlankHandler( INTERRUPT_VBlankHandler );
	cellGcmSetUserHandler( INTERRUPT_UserHandler );
}
// Detaches the vblank and user interrupt handlers installed by Init().
void CFlipHandler::Shutdown()
{
	cellGcmSetVBlankHandler( NULL );
	cellGcmSetUserHandler( NULL );
}
//////////////////////////////////////////////////////////////////////////
// 1. draw PS/3 system menus into the surface
// 2. queue a reliable "flip ready" event for GCM interrupt thread to process and flip surface to this
//
void CFlipHandler::QmsPrepareFlipSubmit( GcmUserCommandEnum_t nEvent, uint surfaceFlipIdx )
{
	// Queue the system flip and remember its id so the interrupt thread can
	// complete the actual flip when this event is drained from the FIFO
	uint32 nSystemFlipId = GCM_FUNC_NOINLINE( cellGcmSetPrepareFlip, surfaceFlipIdx );
	m_nSystemFlipId[surfaceFlipIdx] = nSystemFlipId;
	// The surface must not already be marked flip-ready
	Assert( !m_evFlipReady[ surfaceFlipIdx ].Check() );
	m_interruptFifo.Queue( nEvent, surfaceFlipIdx );
}
// Debug: when nonzero, toggles MLAA off periodically so its effect is visible
ConVar r_ps3_mlaa_pulse( "r_ps3_mlaa_pulse", "0" );
// Mask of MLAA mode bits accepted from the r_ps3_mlaa convar
enum EdgePostFlags_t {
	EDGE_POST_MLAA_FLAG_MASK = ( EDGE_POST_MLAA_MODE_ENABLED | EDGE_POST_MLAA_MODE_SHOW_EDGES | EDGE_POST_MLAA_MODE_SINGLE_SPU_TRANSPOSE | EDGE_POST_MLAA_MODE_TRANSPOSE_64 )
};
// Frame start: writes the frame-begin RSX timestamp (once per frame) and
// latches this frame's MLAA mode from the convar, subject to the per-frame
// mask and the debug pulse convar.
void CFlipHandler::BeginScene()
{
#if GCM_ALLOW_TIMESTAMPS
	if ( g_ps3_timestampBeginIdx >= 0 )
	{
		GCM_FUNC( cellGcmSetTimeStamp, g_ps3_timestampBeginIdx );
		g_ps3_timestampBeginIdx = -1;
	}
#endif
	m_nMlaaFlagsThisFrame = r_ps3_mlaa.GetInt() & EDGE_POST_MLAA_FLAG_MASK;
	if( int nPulse = r_ps3_mlaa_pulse.GetInt() )
	{
		// Debug pulsing: alternate MLAA on/off every nPulse frames
		if( 1 & ( g_spuGcm.m_nFrame / nPulse ) )
		{
			m_nMlaaFlagsThisFrame = 0; // disable for 16 frames = 1/2 second
		}
	}
	m_nMlaaFlagsThisFrame &= m_nMlaaFlagMaskNextFrame;
	//m_nMlaaFlagMaskNextFrame = (uint)-1;
}
// Ensures the Edge Post (MLAA) result for the given surface is back in local
// memory before RSX flips it: synchronizes RSX with the SPU workload (via a
// wait-label or a JTS call), then queues the main->local image transfer when
// the workload writes its output to main memory. Idempotent per frame via
// m_bEdgePostResultAlreadyInLocalMemory.
void CFlipHandler::TransferMlaaResultIfNecessary( uint nSurfacePrevFlipIdx )
{
	if( m_bEdgePostResultAlreadyInLocalMemory )
		return;
	if( g_edgePostWorkload.ShouldUseLabelForSynchronization() )
	{
		GCM_FUNC( cellGcmSetWaitLabel, GCM_LABEL_EDGEPOSTMLAA, nSurfacePrevFlipIdx );
	}
	else
	{
		// wait for SPU to finish post-processing previous surface
		uint32 *pPrevJts = &g_spuGcm.m_pEdgePostRsxLock[ nSurfacePrevFlipIdx ];
		if( *pPrevJts != CELL_GCM_RETURN() )
		{
			GCM_FUNC( cellGcmSetCallCommand, uintp( pPrevJts ) + g_spuGcmShared.m_nIoOffsetDelta );
		}
	}
	//
	// NOTE: we can start post-processing before SetPrepareFlip, it only makes sense since we don't always use interrupt to do so
	// if we ever do proper synchronization with SPU workload, we should kick Edge Post here, before SetPrepareFlip
	//
	if( g_edgePostWorkload.IsResultInMainMemory() )
	{
		CPs3gcmLocalMemoryBlockSystemGlobal & prevSurfaceColor = g_ps3gcmGlobalState.m_display.surfaceColor[nSurfacePrevFlipIdx];
		GCM_FUNC( cellGcmSetTransferImage, CELL_GCM_TRANSFER_MAIN_TO_LOCAL,
			prevSurfaceColor.Offset(), g_ps3gcmGlobalState.m_nSurfaceRenderPitch, 0, 0,
			uintp( g_spuGcm.m_pMlaaBufferOut ) + g_ps3gcmGlobalState.m_nIoOffsetDelta, g_ps3gcmGlobalState.m_nSurfaceRenderPitch, 0, 0,
			g_ps3gcmGlobalState.m_nRenderSize[0], g_ps3gcmGlobalState.m_nRenderSize[1],
			4 );
	}
	m_bEdgePostResultAlreadyInLocalMemory = true;
}
bool CFlipHandler::QmsAdviceBeforeDrawPrevFramebuffer()
{
uint nSurfacePrevFlipIdx = g_ps3gcmGlobalState.m_display.PrevSurfaceIndex( 1 );
uint8 prevPostProcessed = m_surfaceEdgePost[nSurfacePrevFlipIdx];
if( prevPostProcessed ) // did previous surface need post-processing?
{
// we'd actually be free to start MLAA here instead of in Flip, for the cost of one more RSX->PPU interrupt
// but we don't do that because we only may do so when the LAST player draws, and we don't know if this post processing
// that will now start is related to the LAST player
// we don't need to do that until flip if we're using deferred queue
// although if we're using deferred queue and we run out of space there, we stop using it, replay it and start defer-render into previous frame
TransferMlaaResultIfNecessary( nSurfacePrevFlipIdx );
// do the post-processing on this frame, in the mean time render into previous frame
return true;
}
return false; // there's no need to switch surfaces now
}
// End-of-frame flip. Queues Edge Post processing and/or the flip of the
// appropriate surface, notifies the RSX interrupt thread, flushes the
// command buffer, then blocks until the surface after next is free so the
// next frame can safely render into the next surface.
void CFlipHandler::Flip()
{
#if GCM_ALLOW_TIMESTAMPS
	OnFrameTimestampAvailableMST( 1.0f );
#endif
	extern ConVar mat_vsync;
	m_bVSync = mat_vsync.GetBool();
	g_ps3gcmGlobalState.CmdBufferFlush( CPs3gcmGlobalState::kFlushForcefully );
	g_spuGcm.GetDrawQueue()->Push1( SPUDRAWQUEUE_FRAMEEVENT_METHOD | SDQFE_END_FRAME );
	// current / next / after-next / previous surface indices in the flip chain
	uint surfaceFlipIdx = g_ps3gcmGlobalState.m_display.surfaceFlipIdx, nSurfaceNextFlipIdx = g_ps3gcmGlobalState.m_display.NextSurfaceIndex( 1 ), nSurfaceAfterNextFlipIdx = g_ps3gcmGlobalState.m_display.NextSurfaceIndex( 2 ), nSurfacePrevFlipIdx = g_ps3gcmGlobalState.m_display.PrevSurfaceIndex( 1 );
	/*
	uint nScreenWidth = g_ps3gcmGlobalState.m_nRenderSize[0];
	uint nScreenY = 40;
	g_ps3gcmGlobalState.DrawDebugStripe( nScreenWidth * surfaceFlipIdx / 3, nScreenY, 0, nScreenWidth / 3, 4 );
	g_ps3gcmGlobalState.DrawDebugStripe( ( g_spuGcm.m_nFrame & 0xF ) * ( nScreenWidth / 16 ), 34, 0, ( nScreenWidth / 16 ) * ( 1 + m_nFlipSurfaceCount ), 1 );
	*/
	// let interrupt know we're ready to post-process the new frame, and we wanna flip the previous frame
	//g_ps3gcmGlobalState.CmdBufferFinish();
	uint32 * pThisJts = g_spuGcm.m_pEdgePostRsxLock + surfaceFlipIdx; // may be NULL + idx
	Assert( !g_spuGcm.m_pEdgePostRsxLock || *pThisJts == CELL_GCM_RETURN() );
	uint8 prevPostProcessed = m_surfaceEdgePost[nSurfacePrevFlipIdx];
	uint8 thisPostProcess = g_spuGcm.m_pMlaaBuffer ? ( uint8 ) ( m_nMlaaFlagsThisFrame & EDGE_POST_MLAA_FLAG_MASK ): 0 ;
	if( prevPostProcessed ) // did previous surface need post-processing?
	{
		TransferMlaaResultIfNecessary( nSurfacePrevFlipIdx );
		//if( g_spuGcm.m_bUseDeferredDrawQueue )
		{
			// now is the time to execute all the deferred commands, if there are any
			// NOTE: this will often do nothing , because current frame would've flushed previous frame deferred commands already
			// right before starting writing its own
			g_spuGcm.ExecuteDeferredDrawQueue( 1 );
		}
		//g_ps3gcmGlobalState.DrawDebugStripe( nScreenWidth * surfaceFlipIdx / 3, 44, surfaceFlipIdx, nScreenWidth / 3, 2, -1 );
		// prepare flip of previous frame - Edge Post processed buffer
		// the previous frame was post-processed; we'll prepare flip on it.
		QmsPrepareFlipSubmit( GCM_USERCMD_FLIPREADY, nSurfacePrevFlipIdx );
	}
	else
	{
		// if previous frame wasn't post-processed, don't flip it because we don't want to flip the same framebuffer twice (although we probably could)
		// so we don't have anything to flip here, but have a frame to post-process
		g_spuGcm.ExecuteDeferredDrawQueue( 1 );
	}
	m_surfaceEdgePost[surfaceFlipIdx] = thisPostProcess; // is post-process required for this surface ?
	if( thisPostProcess )
	{
		if( !( m_nMlaaFlagsThisFrame & EDGE_POST_MLAA_MODE_ENABLED ) )
		{
			m_bEdgePostResultAlreadyInLocalMemory = true; // don't attempt to transfer the results; we don't _really_ do edge post processing, so we consider the results are in memory already
		}
		else
		{
			// EDGE POST TODO: JTS - the previous EdgePost must release it. To avoid overwriting edge post buffer before it finished tranferring back to local memory
			// to release JTS from the future, we can use a separate ring buffer "JTS-RET" sequences and just call into it here.
			// or we can wait for a label and set it from SPU
			// as a simplification, we can just wait for edge post to finish synchronously on ppu
			// we can also use a mutex of sorts and insert JTS here only when edge post is not finished yet
			// we only can start transferring the image after the SPU is done streaming previous frame (if previous frame was post-processed)
			// so wait for SPU to release previous frame, if it was post-processed.
			// Also, if SPU didn't finish post-processing, then we need to synchronize (wait on RSX for SPU to be done)
			// but in many cases SPU will be done by now, so we don't need to spend 900+ns in RSX front-end on CALL+RET
			if( !g_edgePostWorkload.ShouldUseLabelForSynchronization() )
			{
				*pThisJts = CELL_GCM_JUMP( uintp( pThisJts ) + g_spuGcmShared.m_nIoOffsetDelta ); // this will be JTS for SPU to overwrite when post-processing of this frame is done
			}
			// copy the rendered frame to main memory so the SPU MLAA workload can read it
			CPs3gcmLocalMemoryBlockSystemGlobal & surfaceColor = g_ps3gcmGlobalState.m_display.surfaceColor[surfaceFlipIdx];
			GCM_FUNC( cellGcmSetTransferImage, CELL_GCM_TRANSFER_LOCAL_TO_MAIN,
				uintp( g_spuGcm.m_pMlaaBuffer ) + g_ps3gcmGlobalState.m_nIoOffsetDelta, g_ps3gcmGlobalState.m_nSurfaceRenderPitch, 0, 0,
				surfaceColor.Offset(), g_ps3gcmGlobalState.m_nSurfaceRenderPitch, 0, 0,
				g_ps3gcmGlobalState.m_nRenderSize[0], g_ps3gcmGlobalState.m_nRenderSize[1],
				4 );
			// This frame was rendered and transferred to main memory; we'll let interrupt thread know it's ready for Edge Post processing
			m_interruptFifo.Queue( GCM_USERCMD_POSTPROCESS, surfaceFlipIdx );
			m_bEdgePostResultAlreadyInLocalMemory = false;
		}
	}
	else
	{
		// we aren't post-processing this frame, so we need to just prepare flip and flip this framebuffer
		g_spuGcm.ExecuteDeferredDrawQueue( 0 );
		QmsPrepareFlipSubmit( GCM_USERCMD_FLIPREADY, surfaceFlipIdx );
		m_bEdgePostResultAlreadyInLocalMemory = true; // don't attempt to transfer the results; we don't do edge post - processing, so we consider the results are in memory already
	}
	g_spuGcm.FlipDeferredDrawQueue( );
	if( thisPostProcess && !prevPostProcessed )
	{
		// we absolutely MUST reset RSX state before the next frame.
		// QmsPrepareFlipSubmit() does that by definition, but if we don't call it in this Flip (i.e. when !prevPostProcessed && thisPostProcess)
		// we must FORCE RSX state reset
		g_spuGcm.GetDrawQueue()->Push1( SPUDRAWQUEUE_RESETRSXSTATE_METHOD );
	}
#if GCM_ALLOW_TIMESTAMPS
	{
		// The current frame has just finished, insert a timestamp instruction right before flip
		GCM_FUNC( cellGcmSetTimeStamp, surfaceFlipIdx * 2 + GCM_REPORT_TIMESTAMP_FRAME_FIRST + 1 );
		g_ps3_timestampBeginIdx = nSurfaceNextFlipIdx * 2 + GCM_REPORT_TIMESTAMP_FRAME_FIRST;
	}
#endif
	m_interruptFifo.QueueRsxInterrupt();
	g_ps3gcmGlobalState.CmdBufferFlush( CPs3gcmGlobalState::kFlushEndFrame );
	//g_ps3gcmGlobalState.CmdBufferFinish();
	//
	// Make sure that the next framebuffer is free to render into. For that to be so,
	// the flip should happen from the next to the buffer after next. When that happens,
	// the TV shows the buffer after next, and the next buffer is not visible to the user,
	// so it's allowed to render into the next buffer.
	//
	FLIP_LOG( "ev Wait %d", nSurfaceAfterNextFlipIdx );
	m_evFlipReady[ nSurfaceAfterNextFlipIdx ].Wait();
	m_evFlipReady[ nSurfaceAfterNextFlipIdx ].Reset();
	FLIP_LOG( "Draw %d, ev Reset %d", nSurfaceNextFlipIdx, nSurfaceAfterNextFlipIdx );
#if GCM_ALLOW_TIMESTAMPS
	{
		// Since the previous flip completely finished, we can grab its timestamps now
		uint32 uiLastFrameTimestampIdx = ( nSurfaceAfterNextFlipIdx ) * 2 + GCM_REPORT_TIMESTAMP_FRAME_FIRST;
		uint64 uiStartTimestamp = cellGcmGetTimeStamp( uiLastFrameTimestampIdx );
		uint64 uiEndTimestamp = cellGcmGetTimeStamp( uiLastFrameTimestampIdx + 1 );
		uint64 uiRsxTimeInNanoSeconds = uiEndTimestamp - uiStartTimestamp;
		OnFrameTimestampAvailableRsx( uiRsxTimeInNanoSeconds / 1000000.0f );
	}
#endif
}
bool IsRsxReadyForNoninteractiveRefresh( )
{
uint nSurfaceAfterNextFlipIdx = g_ps3gcmGlobalState.m_display.NextSurfaceIndex( 2 );
return g_flipHandler.m_evFlipReady[ nSurfaceAfterNextFlipIdx ].Check();
// if we are 3 vblanks past last flip already, another refresh would be welcome ; if we have no surfaces to flip in this case, we are most likely ready to flip right away
// another thing to check is the interrupt FIFO: if it's not idle, let's just postpone being ready
// return g_flipHandler.m_nVblankCounter > 3 && g_flipHandler.m_nFlipSurfaceCount == 0 && g_flipHandler.m_interruptFifo.IsIdle();
}
void CFlipHandler::TryFlipVblank()
{
	// Called on every vblank. Re-reads the last interrupt marker directly from
	// main memory and pumps events as if a user interrupt had arrived, to
	// artificially simulate an interrupt for cause, because there's suspicion it was dropped.
	//
	// only attempt to generate artificial interrupts if our ready flip queue is empty, otherwise there's no need
	// to tap the narrow 15.6Mb/s bus
	uint nMarker = *g_flipHandler.m_pLastInterruptGet;
	m_nVblankCounter ++;
#if ENABLE_FLIP_LOG
	// Function-static cursor into the flip log ring; repeated vblanks between
	// other log entries are coalesced into the same slot ("Vblanks ..N").
	static int m_nLastFlipLogIdx = 0;
	if( m_nLastFlipLogIdx != g_flipLogIdx )
	{
		// NOTE(review): overwrites the previously-claimed slot with a cumulative
		// vblank count — confirm intended when other threads advanced g_flipLogIdx
		V_snprintf( g_flipLog[m_nLastFlipLogIdx], sizeof( g_flipLog[m_nLastFlipLogIdx] ), "%X.Vblanks ..%d", nMarker, m_nVblankCounter );
	}
	else
	{
		// claim a fresh slot in the power-of-two log ring and start a new entry
		m_nLastFlipLogIdx = cellAtomicIncr32( &g_flipLogIdx ) & ( ARRAYSIZE( g_flipLog ) - 1 );
		V_snprintf( g_flipLog[m_nLastFlipLogIdx], sizeof( g_flipLog[m_nLastFlipLogIdx] ), "%X.Vblank %d", nMarker, m_nVblankCounter );
	}
#endif
	TryPumpEvents( nMarker, 1 ); // isVblank = 1
}
bool CFlipHandler::TryFlipSurface( uint isVblank )
{
	// Attempts to execute one queued flip immediately. Returns false when no
	// surface is queued, or when vsync pacing says it's too early (not enough
	// vblanks elapsed) or too late (missed the vblank by too much). Called from
	// both the vblank and the user-interrupt pump paths.
	if( !m_nFlipSurfaceCount )
	{
		return false; // nothing queued to flip
	}
	if( m_bVSync )
	{
		// honor the configured present frequency, measured in vblanks
		if( m_nVblankCounter < m_nPresentFrequency )
		{
			//FLIP_LOG( "no flip: %d vblanks", m_nVblankCounter, m_nPresentFrequency );
			return false;
		}
		if( !isVblank )
		{
			// Not on the vblank itself: only flip if we're still within a tunable
			// fraction of the measured vsync interval since the last vblank.
			double flVSyncInterval = m_flVBlankTimestamp - m_flVBlankTimestamp0, flMissThreshold = r_ps3_vblank_miss_threshold.GetFloat() * flVSyncInterval;
			double flMiss = Plat_FloatTime() - m_flVBlankTimestamp;
			if ( flMiss > flMissThreshold )
			{
				FLIP_LOG("no flip: %.2fms miss", flMiss * 1000 );
				return false; // wait for another vsync, missed by too much
			}
		}
	}
	// flip the surface immediately
	uint nSystemFlipId = m_nSystemFlipId[ m_nFlipSurfaceIdx ];
	cellGcmSetFlipImmediate( nSystemFlipId );
#ifdef GCM_ALLOW_TIMESTAMPS
	// Collect time since previous flip
	double flFlipImmediateTimestamp = Plat_FloatTime();
	OnFrameTimestampAvailableFlip( ( flFlipImmediateTimestamp - m_flFlipImmediateTimestamp ) * 1000.0f );
	m_flFlipImmediateTimestamp = flFlipImmediateTimestamp;
#endif
	FLIP_LOG( isVblank ? "vFlip%u, ev Set %u" : "_Flip%u, ev Set %u", nSystemFlipId, m_nFlipSurfaceIdx );
	// Release PPU QMS thread waiting for this flip
	m_evFlipReady[m_nFlipSurfaceIdx].Set();
	// advance the flip ring and restart vblank pacing
	m_nFlipSurfaceIdx = ( m_nFlipSurfaceIdx + 1 ) % CPs3gcmDisplay::SURFACE_COUNT;
	m_nFlipSurfaceCount--;
	m_nVblankCounter = 0;
	return true;
}
void CFlipHandler::TryPumpEvents( uint nMarker, uint isVblank )
{
	// Pump pending RSX interrupt events up to nMarker, then attempt a flip — but
	// only when the interrupt-thread mutex can be taken without blocking. If
	// another thread already holds it, it is pumping the same state, so it's
	// safe to skip.
	if ( m_mutexOfInterruptThread.TryLock() )
	{
		PumpEventsUnsafe( nMarker );
		TryFlipSurface( isVblank ); // this will often be a duplicate call
		// Fix: unlock the same member that was locked above. The original went
		// through the g_flipHandler global (same object, since this class is a
		// singleton) — inconsistent and misleading inside a member function.
		m_mutexOfInterruptThread.Unlock();
	}
}
void CFlipHandler::PumpEventsUnsafe( uint nMarker )
{
	// Drain every queued event up to nMarker; a handler may stop the pump early
	// by returning false. Caller must hold m_mutexOfInterruptThread.
	for ( ;; )
	{
		if ( !m_interruptFifo.HasEvents( nMarker ) )
			break;
		RsxInterruptFifo::Event_t event = m_interruptFifo.DequeueEvent();
		if ( !OnRsxInterrupt( event ) )
			break;
	}
}
bool RsxInterruptFifo::IsValidMarker( uint nMarker )
{
	// Unsigned wraparound turns this into a window test: the marker is valid
	// when it lies at most MAX_EVENT_COUNT entries ahead of the get cursor.
	uint nDistanceAhead = nMarker - m_nGet;
	return nDistanceAhead <= MAX_EVENT_COUNT;
}
bool CFlipHandler::OnRsxInterrupt( const RsxInterruptFifo::Event_t event )
{
	// Dispatch a single dequeued RSX user event. Returns true to let the pump
	// continue (currently always true).
	switch( event.m_nCause )
	{
	case GCM_USERCMD_POSTPROCESS:
		{
			// start edge post processing phase here; we can't do the flip yet because we didn't post-process the buffer yet
			// Simulating MLAA job running and adding the cause to the end of the array some time in the nearest (4-5ms) future
			void * pColorSurface = g_ps3gcmGlobalState.m_display.surfaceColor[event.m_nSurfaceFlipIdx].DataInLocalMemory();
			if( true ) // NOTE(review): else-branch is a disabled synchronous-MLAA debug path
			{
				// g_spuGcm.SyncMlaa();
				g_edgePostWorkload.Kick( pColorSurface, event.m_nSurfaceFlipIdx );
			}
			else
			{
				FLIP_LOG( "mlaa sync %d", event.m_nSurfaceFlipIdx );
				g_spuGcm.SyncMlaa( pColorSurface );
				g_spuGcm.m_pEdgePostRsxLock[event.m_nSurfaceFlipIdx] = CELL_GCM_RETURN(); // this will be poked by the SPU job
			}
		}
		break;
	case GCM_USERCMD_FLIPREADY:
		// The surface finished rendering (and post-processing, if any): queue it
		// for flipping. The assert checks the event arrives in ring order.
		FlipAssert( ( m_nFlipSurfaceIdx + m_nFlipSurfaceCount ) % CPs3gcmDisplay::SURFACE_COUNT == event.m_nSurfaceFlipIdx );
		FLIP_LOG( "flip ready %d:sys%d", event.m_nSurfaceFlipIdx, m_nSystemFlipId[event.m_nSurfaceFlipIdx] );
		m_nFlipSurfaceCount++;
		break;
	}
	return true;
}
void CFlipHandler::INTERRUPT_VBlankHandler( const uint32 head )
{
double flVBlankTimestampSave = g_flipHandler.m_flVBlankTimestamp;
g_flipHandler.m_flVBlankTimestamp = Plat_FloatTime();
g_flipHandler.m_flVBlankTimestamp0 = flVBlankTimestampSave;
g_flipHandler.TryFlipVblank( );
}
void CFlipHandler::INTERRUPT_UserHandler( const uint32 nMarker )
{
	// Static callback for GCM user interrupts: pump events for markers that are
	// still inside the valid window; a stale marker means it already happened.
	if( !g_flipHandler.m_interruptFifo.IsValidMarker( nMarker ) )
	{
		// invalid marker: this marker has already happened; skip it
		//FLIP_LOG( "%X.ERROR.UserInterrupt", nMarker );
		DebuggerBreak();
		return;
	}
	//FLIP_LOG( "%X.UserInterrupt", nMarker );
	g_flipHandler.TryPumpEvents( nMarker, 0 );
}
void Ps3gcmFlip_SetFlipPresentFrequency( int nNumVBlanks )
{
	// Clamp the requested present interval (in vblanks) to the supported [1,12]
	// range and store it only when it actually changes. The original nested the
	// change-check twice; a single check after clamping is equivalent, because
	// when the clamped value equals the stored one, the store is a no-op either way.
	nNumVBlanks = MAX( 1, nNumVBlanks );
	nNumVBlanks = MIN( 12, nNumVBlanks );
	if ( g_flipHandler.m_nPresentFrequency != nNumVBlanks )
	{
		g_flipHandler.m_nPresentFrequency = nNumVBlanks;
	}
}
/*
void CFlipHandler::OnState( int nState, int nValue )
{
m_nDebugStates[nState] = nValue;
if( m_nDebugStates[RENDERING_SURFACE] == m_nDebugStates[DISPLAYING_SURFACE] )
DebuggerBreak();
}*/

View File

@@ -0,0 +1,122 @@
//========== Copyright © 2010, Valve Corporation, All rights reserved. ========
#ifndef MATERIALSYSTEM_PS3GCM_RSXFLIP_HDR
#define MATERIALSYSTEM_PS3GCM_RSXFLIP_HDR
#ifndef _CERT
#define GCM_ALLOW_TIMESTAMPS 1
void OnFrameTimestampAvailableFlip( float ms );
void OnFrameTimestampAvailableRsx( float ms );
void OnFrameTimestampAvailableMain( float ms );
void OnFrameTimestampAvailableMST( float ms );
extern int32 g_ps3_timestampBeginIdx;
#endif
#include "ps3/ps3gcmmemory.h"
// FIFO ring of user-interrupt events, filled by the render thread and drained
// from the RSX interrupt callback. Markers are free-running put positions;
// see IsValidMarker for the wraparound window test.
class RsxInterruptFifo
{
public:
	struct Event_t
	{
		uint8 m_nCause;          // GCM_USERCMD_* event code
		uint8 m_nSurfaceFlipIdx; // display surface this event refers to
	};
protected:
	enum { MAX_EVENT_COUNT = 0x80 }; // ring capacity
	volatile uint m_nGet; // consumer cursor (advanced in interrupt context)
	uint m_nPut;          // producer cursor
	Event_t m_queue[MAX_EVENT_COUNT];
public:
	void Init();
	uint Queue( uint8 nCause, uint8 nSurfaceFlipIdx );
	uint Queue( const Event_t &event );
	uint GetPutMarker()const;
	int HasEvents( uint nMarker );
	bool IsIdle()const { return m_nPut == m_nGet;}
	bool IsValidMarker( uint nMarker );
	Event_t & PeekEvent();
	const Event_t DequeueEvent( );
	void QueueRsxInterrupt();
};
// Owns the display flip state machine: queues flips from the render thread,
// executes them from the RSX vblank/user interrupt handlers, and coordinates
// MLAA (edge post) processing with the flip pipeline.
class CFlipHandler
{
public:
	void Init();
	void Shutdown();
	void Flip();
	void BeginScene();
	void EndScene(){}
	// interrupt-thread event dispatch; returns true to continue pumping
	bool OnRsxInterrupt( const RsxInterruptFifo::Event_t event );
	void TryFlipVblank();
	void TryPumpEvents( uint nMarker, uint isVblank );
	void QmsPrepareFlipSubmit( GcmUserCommandEnum_t nEvent, uint surfaceFlipIdx );
	bool QmsAdviceBeforeDrawPrevFramebuffer();
	// MLAA enable/disable controls (the "Permannetly" spelling is part of the public interface)
	void DisableMlaa(){ m_nMlaaFlagsThisFrame = 0; }
	void DisableMlaaPermannetly(){ m_nMlaaFlagMaskNextFrame = 0; }
	void EnableMlaaPermannetly(){ m_nMlaaFlagMaskNextFrame = ~0u; }
	//void DisableMlaaForTwoFrames(){ m_nMlaaFlagsThisFrame = m_nMlaaFlagMaskNextFrame = 0; }
	int IsMlaaEnabled()const { return m_nMlaaFlagsThisFrame; }
	enum DebugStateEnum_t
	{
		RENDERING_SURFACE,
		DISPLAYING_SURFACE,
		DEBUG_STATE_COUNT
	};
	//void OnState( int nState, int nValue );
public:
	static void INTERRUPT_VBlankHandler( const uint32 head );
	static void INTERRUPT_UserHandler( const uint32 cause );
	void PumpEventsUnsafe( uint nMarker ); // caller must hold m_mutexOfInterruptThread
	bool TryFlipSurface( uint isVblank );
protected:
	void TransferMlaaResultIfNecessary( uint nSurfacePrevFlipIdx );
public:
	//int m_nDebugStates[DEBUG_STATE_COUNT];
	// How often to present in terms of vblanks?
	// (@60Hz scanout TV: 1 = 60 Hz = every vblank, 2 = 30 Hz = every other vblank, 3 = 20 Hz = every 3rd vblank)
	// (@50Hz PAL TV: 1 = 50 Hz = every vblank, 2 = 25 Hz = every other vblank, 3 = 17 Hz = every 3rd vblank)
	int m_nPresentFrequency;
	// Interrupt-driven data
#ifdef GCM_ALLOW_TIMESTAMPS
	double m_flFlipImmediateTimestamp; // Plat_FloatTime() of the last immediate flip
#endif
	// current and previous vblank timestamps; their difference estimates the vsync interval
	double m_flVBlankTimestamp, m_flVBlankTimestamp0;
	// Mutex to sync with interrupt thread
	CThreadMutex m_mutexOfInterruptThread;
	CThreadManualEvent m_evFlipReady[ CPs3gcmDisplay::SURFACE_COUNT ];
	uint m_nFlipSurfaceIdx, m_nFlipSurfaceCount; // the next surface to flip, count of surfaces to flip
	uint m_nSystemFlipId[ CPs3gcmDisplay::SURFACE_COUNT ];
	//uint m_nLastFlippedSurfaceIdx; // used to check for duplicate TryFlip callbacks
	uint m_nVblankCounter; // vblanks since the last flip; reset by TryFlipSurface
	uint32 * m_pLastInterruptGet; // main-memory copy of the last interrupt marker
	RsxInterruptFifo m_interruptFifo;
	uint8 m_surfaceEdgePost[CPs3gcmDisplay::SURFACE_COUNT]; // true when the corresponding surface must be post-processed
	// VSync enabled?
	// true = Syncronize with VSync = true
	// false = Syncronize with every HSync scanline
	bool m_bVSync;
	bool m_bEdgePostResultAlreadyInLocalMemory;
	int m_nMlaaFlagsThisFrame;
	int m_nMlaaFlagMaskNextFrame;
};
extern CFlipHandler g_flipHandler;
#endif

View File

@@ -0,0 +1,4 @@
//================ Copyright (c) 1996-2009 Valve Corporation. All Rights Reserved. =================
#include "dxabstract.h"
#include "common/ps3/rsx_spu_double_ring.cpp"

View File

@@ -0,0 +1 @@
sce-cgc --mnvb -p sce_fp_rsx -o shader_ps_empty.bin shader_ps_empty.cg

Binary file not shown.

View File

@@ -0,0 +1,7 @@
// Trivial fragment shader: writes constant opaque magenta. Used as the
// "empty" placeholder pixel shader (compiled by the sce-cgc command above).
void main
(
	out float4 c : COLOR
)
{
	c.rgba = float4( 1, 0, 1, 1 );
}

View File

@@ -0,0 +1,11 @@
0x00, 0x00, 0x1B, 0x5C, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xB0, 0x00, 0x00, 0x00, 0x01
, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x80
, 0x00, 0x00, 0x04, 0x18, 0x00, 0x00, 0x0A, 0xC5, 0x00, 0x00, 0x10, 0x05, 0xFF, 0xFF, 0xFF, 0xFF
, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x50
, 0x00, 0x00, 0x10, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00
, 0x43, 0x4F, 0x4C, 0x4F, 0x52, 0x00, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF
, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
, 0x1E, 0x7E, 0x7E, 0x00, 0xC8, 0x00, 0x1C, 0x9D, 0xC8, 0x00, 0x00, 0x01, 0xC8, 0x00, 0x00, 0x01
, 0x1E, 0x01, 0x01, 0x00, 0x28, 0x02, 0x1C, 0x9C, 0xC8, 0x00, 0x00, 0x01, 0xC8, 0x00, 0x00, 0x01
, 0x00, 0x00, 0x3F, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00

View File

@@ -0,0 +1,210 @@
//========== Copyright © Valve Corporation, All rights reserved. ========
#include "tier0/memalloc.h"
#include "ps3/ps3_gcm_config.h"
#include "spudrawqueue.h"
#include "ps3gcmstate.h"
void SpuDrawQueue::Init( uint nBufferSize, uint32 * pSignal, FnFlushCallback_t fnFlushCallback, FnStallCallback_t fnStallCallback )
{
	// Allocates the main-memory command ring and wires up the SPU-side signal
	// word plus the flush/stall callbacks. The buffer must hold at least two
	// LS-ring-sized windows so the flush watermark stays ahead of the put cursor.
	if( nBufferSize < 2 * DRAWQUEUE_LSRING_SIZE )
	{
		Warning("SpuDrawQueue requested size (%d bytes) is too small (must be at least %d), auto-adjusting\n", nBufferSize, 2 * DRAWQUEUE_LSRING_SIZE );
		nBufferSize = 2 * DRAWQUEUE_LSRING_SIZE;
	}
	m_pBuffer = ( uint32* ) g_ps3gcmGlobalState.IoSlackAlloc( 128, nBufferSize );
	// round the usable size down to a 16-byte (DMA) boundary
	m_pBufferEnd = AddBytes( m_pBuffer, nBufferSize & -16 );
	m_pPut = m_pGet = m_pBuffer; // put == get means "ring empty"
	*pSignal = GetSignal();
	m_pSignal = pSignal;
	m_fnFlushCallback = fnFlushCallback;
	m_fnStallCallback = fnStallCallback;
	m_fnFlushCallbackStack = NULL;
#ifdef _DEBUG
	m_nAllocBreakAddress = NULL;
	m_nAllocCount = m_nCollectCount = 0;
	m_nAllocBreak = m_nCollectBreak = 0;
#endif
	m_nAllocWords = 0;
	// first flush-advice point: one LS ring past the start of the buffer
	m_pFlushWatermark = AddBytes( m_pBuffer, DRAWQUEUE_LSRING_SIZE );
	if( m_pFlushWatermark + 8 >= m_pBufferEnd )
	{
		// NOTE(review): requires >= 8 words of headroom between the watermark and
		// the buffer end — presumably to guarantee AllocWords can always make
		// progress; confirm against the wrap logic in AllocWords
		Error( "SpuDrawQueue misconfiguration: allocated buffer of %d bytes, but LS watermark size is %d bytes. Increase the main memory buffer size to avoid PPU deadlocks\n", nBufferSize, DRAWQUEUE_LSRING_SIZE );
	}
}
void SpuDrawQueue::PushFlushCallback( FnFlushCallback_t fnNewCallback )
{
	// One-deep callback stack: save the current flush callback, install the new
	// one. Asserts the single save slot is free.
	Assert( !m_fnFlushCallbackStack );
	FnFlushCallback_t fnPrevious = m_fnFlushCallback;
	m_fnFlushCallbackStack = fnPrevious;
	m_fnFlushCallback = fnNewCallback;
}
void SpuDrawQueue::PopFlushCallback()
{
Assert( m_fnFlushCallbackStack );
m_fnFlushCallback = m_fnFlushCallbackStack;
m_fnFlushCallbackStack = NULL;
}
void SpuDrawQueue::Shutdown()
{
	// Return the command ring to the IO-slack allocator it came from (see Init).
	g_ps3gcmGlobalState.IoSlackFree( m_pBuffer );
}
void SpuDrawQueue::UnallocToAlign()
{
	// Round the put cursor DOWN to a 16-byte boundary, discarding any words
	// written in the current partial 16-byte block (-16 acts as mask ~0xF).
	m_pPut = ( uint32* )( uintp( m_pPut ) & -16 );
}
//////////////////////////////////////////////////////////////////////////
// REENTRANT: m_fnFlushCallback can in turn call AllocWords with a small number of words
//
// Allocates nWords contiguous 32-bit words in the ring and returns a pointer
// to them. When the tail of the buffer is too small, it wraps to the start,
// stamping the skipped tail with a NOP-count method word so the SPU consumer
// skips it. Crossing the flush watermark triggers the flush callback (which
// may itself move m_pPut, hence the recompute). Finally spins — invoking the
// stall callback after a few spins — until the SPU get cursor frees room.
uint32 *SpuDrawQueue::AllocWords( uint nWords /*, uint nAlignMask, uint nAlignValue*/ )
{
#ifdef _DEBUG
	uint32 * pSavePut = m_pPut, *pSaveGet = m_pGet;(void)(pSavePut, pSaveGet);
	m_nAllocCount++;
	if( m_nAllocCount == m_nAllocBreak )
		DebuggerBreak();
#endif
	Assert( nWords * sizeof( uint32 ) <= SPUDRAWQUEUE_NOPCOUNT_MASK );
	uint32 * pOldPut = m_pPut, * pAllocation = pOldPut;//( uint32* )( uintp( pOldPut ) + ( ( nAlignValue - uintp( pOldPut ) ) & nAlignMask ) );
	uint32 * pNewPut = pAllocation + nWords;
	bool bWrap = false;
	if( pNewPut > m_pBufferEnd ) // do we need to wrap?
	{
		//we have to wrap...
		if( m_pPut < m_pBufferEnd )
			*m_pPut = SPUDRAWQUEUE_NOPCOUNT_METHOD | ( m_pBufferEnd - m_pPut - 1 );
		pNewPut = m_pBuffer + nWords;
		bWrap = true;
		pAllocation = m_pBuffer;
	}
	// since this put may be the last, we need to make sure that even after alignment, put != get
	// so we wait for the space to free up for aligned put
	uint32 * pNewAlignedPut = ( uint32* )AlignValue( uintp( pNewPut ), DMA_ALIGNMENT );
	// did this allocation cross the flush watermark? (wrapping turns the AND into an OR)
	if( bWrap ? pOldPut <= m_pFlushWatermark || m_pFlushWatermark < pNewAlignedPut:
		pOldPut <= m_pFlushWatermark && m_pFlushWatermark < pNewAlignedPut )
	{
		// collects , aligns and submits commands to SPU
		m_fnFlushCallback( this );
		// m_pPut may have changed slightly for alignment or EndZPass(), so we need to reconsider wrapping and recompute all pointers
		pOldPut = m_pPut; pAllocation = pOldPut;
		pNewPut = pOldPut + nWords;
		bWrap = false;
		if( pNewPut > m_pBufferEnd ) // do we need to wrap?
		{
			//we have to wrap...
			if( m_pPut < m_pBufferEnd )
				*m_pPut = SPUDRAWQUEUE_NOPCOUNT_METHOD | ( m_pBufferEnd - m_pPut - 1 );
			pNewPut = m_pBuffer + nWords;
			bWrap = true;
			pAllocation = m_pBuffer;
		}
		// since this put may be the last, we need to make sure that even after alignment, put != get
		// so we wait for the space to free up for aligned put
		pNewAlignedPut = ( uint32* )AlignValue( uintp( pNewPut ), DMA_ALIGNMENT );
	}
	// we must not allow new put == get, because it will cause the whole ring to suddenly be marked as empty
	uint nSpins = 0;
	while( bWrap ? pOldPut < m_pGet || m_pGet <= pNewAlignedPut : pOldPut < m_pGet && m_pGet <= pNewAlignedPut )
	{
		if( nSpins++ > 2 )
		{
			m_fnStallCallback( this, m_pGet, nWords );
		}
		// refresh the get cursor from the signal word the SPU writes back
		SetSignal( *m_pSignal );
	}
	Assert( pNewPut >= m_pBuffer && pNewPut <= m_pBufferEnd );
	Assert( pAllocation >= m_pBuffer && pAllocation <= m_pBufferEnd );
	Assert( pAllocation + nWords >= m_pBuffer && pAllocation + nWords <= m_pBufferEnd );
	m_pPut = pNewPut; // we don't need to use up the whole aligned buffer
#ifdef _DEBUG
	if( pAllocation == m_nAllocBreakAddress )
		DebuggerBreak();
#endif
	m_nAllocWords += nWords;
	return pAllocation;
}
// This is called within the Flush callback. May change m_pPut
// returns the number of bytes written from UNaligned start to UNaligned end
uint SpuDrawQueue::Collect( uint32 * pStartBatch, uint32 * pEndBatch, CDmaListConstructor & dmac )
{
#ifdef _DEBUG
	CDmaListConstructor saveDmac = dmac;(void)saveDmac;
	m_nCollectCount++;
	Assert( m_nCollectCount != m_nCollectBreak );
#endif
	Assert( pStartBatch >= m_pBuffer && pStartBatch <= m_pBufferEnd && pEndBatch >= m_pBuffer && pEndBatch <= m_pBufferEnd );
	uint nSize = 0;
	if( pEndBatch != pStartBatch ) // or else it's an empty transaction, nothing to upload
	{
		// align the put pointer for DMA - always safe because SPUs can't be processing the remainder of 16-byte block
		// while we're writing into its beginning.
		// while( uintp( pEndBatch ) & ( DMA_ALIGNMENT - 1 ) )
		// {
		// *( pEndBatch++ ) = 0;
		// }
		if( pEndBatch > pStartBatch )
		{
			// contiguous region (no ring wraparound): one DMA input region
			// (the original comment said "it wraps" here, which was backwards)
			dmac.AddInputDmaLargeUnalignedRegion( pStartBatch, pEndBatch );
			nSize += uintp( pEndBatch ) - uintp( pStartBatch );
		}
		else
		{
			// the batch wraps around the ring: upload the tail of the buffer,
			// then the head up to pEndBatch
			if( pStartBatch != m_pBufferEnd )
			{
				dmac.AddInputDmaLargeUnalignedRegion( pStartBatch, m_pBufferEnd );
				nSize += uintp( m_pBufferEnd ) - uintp( pStartBatch );
			}
			dmac.AddInputDmaLargeUnalignedRegion( m_pBuffer, pEndBatch );
			nSize += uintp( pEndBatch ) - uintp( m_pBuffer );
		}
	}
	SetFlushWatermarkFrom( pEndBatch );
	return nSize;
}
void SpuDrawQueue::SetFlushWatermarkFrom( uint32 *pPut )
{
	// Place the next flush-advice point one LS-ring size ahead of pPut, rounded
	// down to 16 bytes, and wrap it back into the buffer if it lands past the end.
	m_pFlushWatermark = ( uint32* )( ( uintp( pPut ) + DRAWQUEUE_LSRING_SIZE ) & -16 );
	while( m_pFlushWatermark >= m_pBufferEnd )
	{
		m_pFlushWatermark -= m_pBufferEnd - m_pBuffer;
	}
}
uint SpuDrawQueue::Length( uint32 * pBegin, uint32 * pEnd )const
{
	// Byte distance from pBegin forward to pEnd along the ring, accounting for
	// wraparound past the end of the buffer.
	Assert( IsValidCursor( pBegin ) && IsValidCursor( pEnd ) );
	if( pBegin >= pEnd )
	{
		// wrapped: tail of the buffer plus the head up to pEnd
		uint nTailBytes = uintp( m_pBufferEnd ) - uintp( pBegin );
		uint nHeadBytes = uintp( pEnd ) - uintp( m_pBuffer );
		return nTailBytes + nHeadBytes;
	}
	return uintp( pEnd ) - uintp( pBegin );
}

View File

@@ -0,0 +1,132 @@
//========== Copyright © Valve Corporation, All rights reserved. ========
//
// This is PPU->SPU fifo queue to feed draw jobs
//
#ifndef SPUDRAWQUEUE_HDR
#define SPUDRAWQUEUE_HDR
#include "tier0/dbg.h"
#include "tier1/strtools.h"
#include "vjobs/pcring.h"
#include "ps3/vjobutils_shared.h"
#include "vjobs/spudrawqueue_shared.h"
extern void StallAndWarning( const char * pWarning );
// PPU-side ring buffer of draw/state command words consumed by SPU jobs. The
// PPU appends words at the put cursor; the SPU reports consumption through an
// external signal word which becomes the new get cursor (see SetSignal).
class SpuDrawQueue
{
public:
	typedef void ( *FnFlushCallback_t)( SpuDrawQueue * );
	typedef void ( *FnStallCallback_t)( SpuDrawQueue *, uint32 * pGet, uint nWords );
	void Init( uint nBufferSize, uint32 * pSignal, FnFlushCallback_t fnFlushCallback, FnStallCallback_t fnStallCallback );
	void Shutdown();
	// one-deep flush-callback stack
	void PushFlushCallback( FnFlushCallback_t fnFlushCallback );
	void PopFlushCallback();
	uint32 *AllocWords( uint nWords /*, uint nAlignMask = 0, uint nAlignValue = 0*/ );
	void UnallocToAlign();
	// allocate a 16-byte-aligned structure of words (T size must be a multiple of 4)
	template<typename T>
	T *AllocAligned( )
	{
		COMPILE_TIME_ASSERT( sizeof( T ) % 4 == 0 );
		Align();
		return ( T* )AllocWords( sizeof( T ) / 4 );
	}
	// allocate a structure preceded by one header word; returns the payload pointer
	template <typename T>
	T *AllocWithHeader( uint nHeader ) { uint32 * pHeader = AllocWords( 1 + sizeof( T ) / 4 ); *pHeader = nHeader; return ( T* )( pHeader + 1 ); }
	uint Collect( uint32 * pStartBatch, uint32 * pEndBatch, CDmaListConstructor & dmac );
	uint32 * GetCursor(){ return m_pPut; }
	uint32 * GetFlushWatermark() {return m_pFlushWatermark;}
	void Align();
	void Push4( uint32 a, uint32 b, uint32 c, uint32 d ){ uint32 * p = AllocWords( 4 ); p[0] = a; p[1] = b; p[2] = c; p[3] = d; }
	void Push3( uint32 a, uint32 b, uint32 c ){ uint32 * p = AllocWords( 3 ); p[0] = a; p[1] = b; p[2] = c; }
	void Push2( uint32 a, uint32 b ){ uint32 * p = AllocWords( 2 ); p[0] = a; p[1] = b; }
	void Push1( uint32 a ){ uint32 * p = AllocWords( 1 ); p[0] = a; }
	enum ConstEnum_t {DMA_ALIGNMENT = 16 };
	void SetFlushWatermarkFrom( uint32 *pPut );
	// the signal is the raw 32-bit address of the put cursor
	uint32 GetSignal()const{ return ( uint32 )m_pPut; }
	uint32 * GetBuffer()const{ return m_pBuffer; }
	uint32 * GetBufferEnd()const { return m_pBufferEnd; }
	uint32 GetBufferWords()const { return m_pBufferEnd - m_pBuffer; }
	bool IsValidCursor( uint32 * p )const { return m_pBuffer <= p && p <= m_pBufferEnd && 0 == ( uintp( p ) & 3 ); }
	uint32 * NormalizeCursor( uint32 * p ) { Assert( IsValidCursor( p ) ); return ( p >= m_pBufferEnd ? m_pBuffer : p ); }
	uint Length( uint32 * pBegin, uint32 * pEnd )const;
protected:
	void SetSignal( uint32 nSignal );
public:
	uint64 m_nAllocWords; // total words allocated over the queue's lifetime
#ifdef _DEBUG
	uint64 m_nAllocCount, m_nCollectCount;
	uint64 m_nAllocBreak, m_nCollectBreak;
	uint32 * m_nAllocBreakAddress;
#endif
protected:
	// the begin and end of the whole buffer
	// it must be 16-byte aligned
	uint32 *m_pBuffer, *m_pBufferEnd;
	// up to this point, we may write stuff. Starting at this point, SPU is reading data
	// m_pPut==m_pGet means "buffer empty"
	// m_pPut > m_pGet means we can write to the end of the buffer and then start at the start
	// m_pPut < m_pGet means we can write from put to get, exclusively
	uint32 *m_pGet;
	// this is the point where we can write stuff, up to m_pGet
	uint32 *m_pPut;
	// external signal in the structure where SPU writes
	volatile uint32 * m_pSignal;
	uint32 *m_pFlushWatermark;
	// FlushCallback member is implemented elsewhere. DrawQueue calls this callback
	// as an advice to flush the queue. The callback doesn't have to flush the queue
	// if the current transaction is deemed atomic. Also, even if the queue is flushed,
	// this object does not get immediate feedback until it reads the signal that SPU sets
	// much later, asynchronously. This callback is important to slice the long transactions
	// into smaller chunks that fit into LS
	FnFlushCallback_t m_fnFlushCallback;
	FnStallCallback_t m_fnStallCallback;
	//enum EnumConst_t{STACK_SIZE = 1 };
	FnFlushCallback_t m_fnFlushCallbackStack;
};
inline void SpuDrawQueue::SetSignal( uint32 nSignal )
{
	// The signal is the SPU-written copy of its get cursor, passed as a raw
	// 32-bit address; adopt it as the new get after sanity-checking ordering.
	uint32 *pNewGet = (uint32*)nSignal;
	// the new get must be between old get and put
	Assert( pNewGet == m_pGet ||
		( pNewGet > m_pGet ? m_pPut < m_pGet || pNewGet <= m_pPut // the new get doesn't wrap around the buffer,
		: m_pPut < m_pGet && pNewGet <= m_pPut // the new get wraps around the buffer, so the put must wrap around, too
		)
		);
	m_pGet = pNewGet;
}
inline void SpuDrawQueue::Align()
{
	// Pad with zero words until the put cursor reaches a 16-byte boundary.
	for( ; uintp( m_pPut ) & 0xF ; )
	{
		Push1( 0 );
	}
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,257 @@
//========== Copyright © Valve Corporation, All rights reserved. ========
// This is the central hub for controlling SPU activities relating to
// RSX/graphics processing/rendering
//
#ifndef SPU_GCM_HDR
#define SPU_GCM_HDR
#include "ps3/spugcm_shared.h"
//#include "ps3/rsx_spu_double_ring.h"
#include "vjobs_interface.h"
#include "ps3/vjobchain.h"
#include "ps3/vjobpool.h"
#include "ps3/ps3gcmmemory.h"
#include "spudrawqueue.h"
#include "gcmfunc.h"
#include <edge/post/edgePost_ppu.h>
#include <edge/post/edgepost_mlaa_handler_ppu.h>
extern CSpuGcmSharedState g_spuGcmShared;
extern void StallAndWarning( const char * pWarning );
// Records rendering jobs issued during a Z-prepass so they can be replayed
// afterwards. Jobs live in the m_pJobs ring; m_nPut/m_nGet are free-running
// indices masked on access (so m_nJobs must be a power of two).
class ZPass
{
public:
	void Init();
	bool CanBegin();
	void Begin( uint32 * pCursor );
	void End() { m_pCursor = NULL; }
	void Shutdown();
	void Validate()const{Assert( !m_nDummy && ( m_nPut - m_nGet ) <= m_nJobs );}
	uint GetSubchainCapacity()const { Validate(); return m_nJobs - ( m_nPut - m_nGet ) ; }
	uint64 * GetCurrentCommandPtr() { return &m_pJobs[ m_nPut & ( m_nJobs - 1 ) ]; }
	void PushCommand( uint64 nCommand );
	operator bool () const { return m_pCursor != NULL; } // true while a ZPass is open
public:
	uint m_nDrawPassSubchain;
	uint m_nJobPoolMarker;
	uint m_nJobs; // ring capacity; masked in GetCurrentCommandPtr, so must be a power of two
	uint m_nDummy;
	uint m_nPut;
	uint m_isInEndZPass;
	ZPassSavedState_t * m_pSavedState;
	uint32 * m_pCursor; // non-NULL while recording (see operator bool / Begin / End)
	uint64 * m_pSubchain;
	uint64 * m_pJobs; // this ring buffer contains recorded rendering jobs to be replayed
	uint m_nFpcpStateEndOfJournalIdxAtZPassBegin; // ... at the beginning of Zpass
	// Notice: this m_nGet member is patched by SPU after a corresponding job subchain is finished
	volatile uint32 m_nGet;
protected:
};
// Wraps the Edge Post (MLAA) SPU workload: a single processing stage plus the
// MLAA context and scratch memory. 128-byte aligned for SPU DMA.
class ALIGN128 CEdgePostWorkload
{
public:
	CEdgePostWorkload(){m_isInitialized = false;}
	void OnVjobsInit( VJobsRoot* pRoot );
	void OnVjobsShutdown( VJobsRoot* pRoot );
	void Kick( void * dst, uint nSetLabel );
	// true: RSX waits on a label for post-process completion (see Flip path)
	bool ShouldUseLabelForSynchronization()const{return true;}
	bool IsResultInMainMemory()const { return true; }
	enum EnumConst_t{STAGE_COUNT=1};
	EdgePostProcessStage m_stages[STAGE_COUNT];
	EdgePostMlaaContext m_mlaaContext;
	EdgePostWorkload m_workload;
	void * m_pMlaaScratch;
	bool m_isInitialized;
} ALIGN128_POST;
extern CEdgePostWorkload g_edgePostWorkload;
// Central hub for controlling SPU activities relating to RSX/graphics
// rendering: owns the PPU->SPU draw queues (normal + deferred), the PCB ring,
// the SPURS job sink/pool, and ZPass recording state.
class CSpuGcm: public VJobInstance
{
public:
	void CreateRsxBuffers();
	void CreateIoBuffers();
	void UseIoBufferSlack( uint nIoBufferSlack );
	void OnGcmInit();
	void Shutdown();
	void BeginScene();
	void EndScene();
	void CmdBufferFlush( )
	{
		GcmStateFlush();
		//PutPcbringCtx();
	}
	void CmdBufferFinish();
	int OnGcmCommandBufferReserveCallback( struct CellGcmContextData *context, uint32_t nCount );
	int OnGcmCommandBufferReserveCallbackOld( struct CellGcmContextData *context, uint32_t nCount );
	void GcmStateFlush( );
	SpuDrawHeader_t * BeginDrawBatch();
	void SubmitDrawBatch( IDirect3DVertexDeclaration9 *pVertDecl, OptimizedModel::OptimizedIndexBufferMarkupPs3_t *pIbMarkup );
	bool TruePause();
	void RenderEmptyFrame();
	void SyncMlaa( void * pLocalSurface );
	void SyncMlaa( ) { SyncMlaa( m_pMlaaBuffer ); }
	bool BeginZPass( );
	void SetPredication( uint nPredicationMask ); // D3DPRED_* mask
	void EndZPass( bool bPopMarker );
	void AbortZPass(){ EndZPass( false ); }
	void OnSetPixelShaderConstant();
	// selector 0 = normal (immediate) queue, 1 = deferred queue
	SpuDrawQueue * GetDrawQueue(){ return &m_spuDrawQueues[m_nSpuDrawQueueSelector];}
	SpuDrawQueue * GetDrawQueueNormal(){ return &m_spuDrawQueues[0]; }
	void DrawQueueNormal( bool bExecuteDeferredQueueSegment = true );
	struct DrawQueueDeferred_Result{ bool isFirstInFrame; };
	DrawQueueDeferred_Result DrawQueueDeferred(); // may flush previous frame deferred queue
	uint IsDeferredDrawQueue() { return m_nSpuDrawQueueSelector; }
	bool ExecuteDeferredDrawQueue( uint nPrevious );
	void FlipDeferredDrawQueue();
	bool ExecuteDeferredDrawQueueSegment( uint32 * pCmdBegin, uint32 * pCmdEnd, bool bExecuteDraws );
	void ValidateDeferredQueue();
	//void DisableMlaaForTwoFrames();
	void DisableMlaaPermanently();
	void DisableMlaa();
protected:
	static void OnSpuDrawQueueStallDeferredDelegator( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords );
	void OnSpuDrawQueueStallDeferred( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords );
	static void OnSpuDrawQueueFlushDeferred( SpuDrawQueue *pDrawQueue );
	static void OnSpuDrawQueueStall( SpuDrawQueue *pDrawQueue, uint32 * pGet, uint nWords );
	static void OnSpuDrawQueueFlush( SpuDrawQueue *pDrawQueue );
	static void OnSpuDrawQueueFlushDoNothing( SpuDrawQueue *pDrawQueue ){}
	static void OnSpuDrawQueueFlushInZPass( SpuDrawQueue *pDrawQueue );
	void OnSpuDrawQueueFlushInZPass( );
	void OnVjobsInit(); // gets called after m_pRoot was created and assigned
	void TestPriorities();
	void OnVjobsShutdown(); // gets called before m_pRoot is about to be destructed and NULL'ed
	uint32 * GetPcbringPtr( uint nOffsetBytes ) { return AddBytes( m_pPcbringBuffer, nOffsetBytes & ( g_spuGcmShared.m_nPcbringSize - 1 ) ); }
	uint32 * GetPcbringBufferEnd() {return AddBytes( m_pPcbringBuffer, g_spuGcmShared.m_nPcbringSize ); }
	signed int GetPcbringAvailableBytes()const;
	//void SetCtxBuffer( uint nSegment );
#if 0
	volatile uint64* PutPcbringCtx( uint32 * pSkipTo, uint32 * pNewEnd );
	volatile uint64* PutPcbringCtx();
#endif
	inline uint GetMaxPcbringSegmentBytes()const { return m_nMaxPcbringSegmentBytes; }
	void BeginGcmStateTransaction();
	void PushSpuGcmJob( CellSpursJob128 * pJob );
	void PushStateFlushJob( SpuDrawQueue * pDrawQueue, uint nResultantSpuDrawQueueSignal, uint32 *pCursorBegin, uint32 * pCursorEnd );
	void PushSpuGcmJobCommand( uint64 nCommand );
	void PushSpuGcmCallSubchain( uint64 * eaJobChain ){ m_jobSink.Push( CELL_SPURS_JOB_COMMAND_CALL( eaJobChain ) );}
	void ZPassCheckpoint( uint nReserveSlots );
	CellSpursJob128 * PushDrawBatchJob( uint nResultantSpuDrawQueueSignal, SpuDrawHeader_t * pDrawHeader, IDirect3DVertexDeclaration9 *pVertDecl, OptimizedModel::OptimizedIndexBufferMarkupPs3_t *pIbMarkup );
public:
	void CloseDeferredChunk();
	uint32* OpenDeferredChunk( uint nHeader = SPUDRAWQUEUE_DEFERRED_GCMFLUSH_METHOD, uint nAllocExtra = 0 );
	// cursor tracking is per-queue (normal vs deferred), indexed by the selector
	void SetCurrentBatchCursor( uint32 * pCursor )
	{
		m_pCurrentBatchCursor[m_nSpuDrawQueueSelector] = pCursor;
	}
	uint32 * GetCurrentBatchCursor()
	{
		return m_pCurrentBatchCursor[m_nSpuDrawQueueSelector];
	}
protected:
	SpuDrawQueue m_spuDrawQueues[2];
	// this frame [0] and previous frames [1] "end" markers for replay
	// gets updated on every chunk close
	uint32* m_pDeferredQueueCursors[3];
	// this is the last point where DrawQueueDeferred() was called
	uint32 * m_pDeferredQueueSegment;
	// pointer to deferred chunk last open; NULL if the last deferred chunk was closed, but none new was open yet
	// this may stay non-NULL( thus indicating non-closed chunk) during executing deferred commands, too,
	// in case of out-of-memory condition. Then, StallDeferred callback will execute deferred commands without closing current chunk.
	// Relation: MANY chunks per ONE batch
	uint32* m_pDeferredChunkHead;
	uint32 m_nDeferredChunkHead;
	uint32 *m_pDeferredChunkSubmittedTill[4]; // only [1] is used; [0] and [2] are write- and debug-only
	uint16 m_nSpuDrawQueueSelector;
	uint16 m_nFramesToDisableDeferredQueue; // disable for this number of frames if we don't have enough memory
public:
	// fragment program constant patcher double ring, JTS->RET , RSX->SPU
	CPs3gcmLocalMemoryBlock m_fpcpRingBuffer, m_edgeGeomRingBuffer;
	VjobChain3 m_jobSink;
	VjobPool<CellSpursJob128> m_jobPool128;
	volatile uint32 * m_pFinishLabel;
	uint32 *m_pPcbringBuffer;
	ZPass m_zPass; // NULL when we aren't in Zpass
	DeferredState_t * m_pDeferredStates[2];
	uint m_nPcbringBegin; // this byte offset corresponds to GCM_CTX->begin
	uint32 m_nPcbringWaitSpins;
	uint32 m_nMaxPcbringSegmentBytes;
	uint32 m_nGcmFlushJobScratchSize;
	uintp m_eaLastJobThatUpdatesSharedState;
	uint m_nFpcpStateEndOfJournalIdxAtSpuGcmJob;
	enum TransactionBatchEnum_t
	{
		BATCH_GCMSTATE, // the default transaction type
		BATCH_DRAW
	};
	TransactionBatchEnum_t m_nCurrentBatch;
	// the batch is a batch of commands to send to an SPU job: job_gcmflush (BATCH_GCMSTATE) or job_drawindexedprimitive (BATCH_DRAW)
	uint32 * m_pCurrentBatchCursor[2];
	void * m_pMlaaBuffer, *m_pMlaaBufferOut;
	volatile vec_uint4 * m_pMlaaBufferCookie;
	uint32 *m_pEdgePostRsxLock;
	uint m_nFrame;
#ifdef _DEBUG
	uint m_nJobsPushed, m_nChunksClosedInSegment;
#endif
	uint64 m_nDeferredQueueWords;
	bool m_bUseDeferredDrawQueue;
};
extern CSpuGcm g_spuGcm;
extern const vec_uint4 g_vuSpuGcmCookie;
// Scratch SPURS job plus notify area used by CSpuGcm::TestPriorities to probe
// job-chain priority behavior. 128-byte aligned for SPU DMA.
struct ALIGN128 PriorityTest_t
{
	CellSpursJob128 m_job;
	job_notify::NotifyArea_t m_notify;
	bool Test( class VjobChain4 *pJobChain );
} ALIGN128_POST;
#endif