Context migration in CUDA.NET

I am currently using the CUDA.NET library from GASS. I need to initialize CUDA arrays (actually CUBLAS vectors, but that doesn't matter) in one CPU thread and use them in another CPU thread. However, a CUDA context, which holds all initialized arrays and loaded functions, can be bound to only one CPU thread at a time.

There is a mechanism called the context migration API that detaches a context from one thread and attaches it to another. But I do not know how to use it correctly in CUDA.NET.

I tried something like this:

/// <summary>
/// Attempt to migrate a CUDA context between CPU threads: the main thread
/// creates the context and uploads a vector, then hands the context to a
/// second thread that modifies the vector and reads it back.
/// </summary>
class Program
{
    private static float[] vector1, vector2;
    private static CUDA cuda;
    private static CUBLAS cublas;
    private static CUdeviceptr ptr;

    /// <summary>
    /// Sets up CUDA/CUBLAS on the main thread, uploads the first vector and
    /// passes the context on to a worker thread.
    /// </summary>
    static void Main(string[] args)
    {
        cuda = new CUDA(false);
        cublas = new CUBLAS(cuda);
        cuda.Init();
        cuda.CreateContext(0);

        AllocateVectors();

        // Release the context from the main thread so another thread can take it.
        // NOTE(review): in the CUDA driver API, cuCtxDetach decrements the context's
        // usage count and destroys the context when it reaches zero. If DetachContext
        // wraps that call, the context may already be gone before the pop - confirm.
        cuda.DetachContext();
        CUcontext poppedContext = cuda.PopCurrentContext();

        GetVectorFromDeviceAsync(poppedContext);
    }

    /// <summary>
    /// Creates the host vectors and copies the first one into freshly
    /// allocated device memory.
    /// </summary>
    private static void AllocateVectors()
    {
        vector2 = new float[5];
        vector1 = new[] { 1f, 2f, 3f, 4f, 5f };

        ptr = cublas.Allocate(vector1.Length, sizeof(float));
        cublas.SetVector(vector1, ptr);
    }

    /// <summary>
    /// Thread entry point: takes over the given context, changes the host
    /// vector, uploads it and downloads it again into <c>vector2</c>.
    /// </summary>
    /// <param name="objContext">The boxed <c>CUcontext</c> handed over by the starting thread.</param>
    private static void GetVectorFromDevice(object objContext)
    {
        CUcontext migrated = (CUcontext)objContext;

        // Make the context current on this thread.
        cuda.PushCurrentContext(migrated);
        cuda.AttachContext(migrated);

        // Change the vector somehow...
        vector1[0] = -1;

        // ...copy the changed vector to the device and read it back.
        cublas.SetVector(vector1, ptr);
        cublas.GetVector(ptr, vector2);

        CUDADriver.cuCtxPopCurrent(ref migrated);
    }

    /// <summary>
    /// Starts a foreground thread that runs <see cref="GetVectorFromDevice"/>
    /// with the given context.
    /// </summary>
    private static void GetVectorFromDeviceAsync(CUcontext cUcontext)
    {
        Thread worker = new Thread(GetVectorFromDevice) { IsBackground = false };
        worker.Start(cUcontext);
    }
}

But execution fails when trying to copy the modified vector to the device, because the context is not attached. Other causes are unlikely, because the same code works fine in single-threaded mode. Any ideas how I can make it work?

+4
source share
2 answers

I still could not find a solution to this problem, but I came up with a workaround. The idea is to execute every method that has anything to do with CUDA on a single CPU thread. For example, you can do it as follows:

 /// <summary>
 /// Demonstrates the workaround: several threads touch the same device vector,
 /// but every CUBLAS call is funnelled through <see cref="CudaManager"/> so
 /// that it executes on one dedicated thread.
 /// </summary>
 class Program
 {
     private static float[] vector1, vector2;
     private static CUDA cuda;
     private static CUBLAS cublas;
     private static CUdeviceptr ptr;

     // Signalled by each helper thread when its work is done.
     private static readonly AutoResetEvent autoResetEvent = new AutoResetEvent(false);

     static void Main()
     {
         cuda = new CUDA(true);
         cublas = new CUBLAS(cuda);

         // Allocate the vector on the CUDA device. The call is issued from the
         // main thread but executed on the CudaManager worker thread.
         CudaManager.CallMethod(AllocateVectors);

         // Change the first vector from another thread.
         Thread changeThread = new Thread(ChangeVectorOnDevice_ThreadRun) { IsBackground = false };
         changeThread.Start();

         // Wait for changeThread to finish.
         autoResetEvent.WaitOne();

         // Get the vector from the device in yet another thread.
         Thread getThread = new Thread(GetVectorFromDevice_ThreadRun) { IsBackground = false };
         getThread.Start();

         // Wait for getThread to finish.
         autoResetEvent.WaitOne();

         Console.WriteLine("({0}, {1}, {2}, {3}, {4})", vector2[0], vector2[1], vector2[2], vector2[3], vector2[4]);
         Console.ReadKey(true);
     }

     /// <summary>Creates the host vectors and uploads the first one to the device.</summary>
     private static void AllocateVectors()
     {
         vector1 = new[] { 1f, 2f, 3f, 4f, 5f };
         vector2 = new float[5];

         // Allocate device memory and copy the first vector to the device.
         ptr = cublas.Allocate(vector1.Length, sizeof(float));
         cublas.SetVector(vector1, ptr);
     }

     /// <summary>Downloads the device vector into <c>vector2</c>.</summary>
     private static void GetVectorFromDevice()
     {
         cublas.GetVector(ptr, vector2);
     }

     /// <summary>Replaces the host vector and copies it to the device.</summary>
     private static void ChangeVectorOnDevice()
     {
         vector1 = new[] { -1f, -2f, -3f, -4f, -5f };
         cublas.SetVector(vector1, ptr);
     }

     /// <summary>Thread entry point: runs <see cref="ChangeVectorOnDevice"/> via the manager, then releases the main thread.</summary>
     private static void ChangeVectorOnDevice_ThreadRun()
     {
         CudaManager.CallMethod(ChangeVectorOnDevice);

         // Release the main thread.
         autoResetEvent.Set();
     }

     /// <summary>Thread entry point: runs <see cref="GetVectorFromDevice"/> via the manager, then releases the main thread.</summary>
     private static void GetVectorFromDevice_ThreadRun()
     {
         CudaManager.CallMethod(GetVectorFromDevice);

         // Release the main thread.
         autoResetEvent.Set();
     }
 }

 /// <summary>
 /// Runs delegates one at a time on a single dedicated background thread, so
 /// that all work submitted through <see cref="CallMethod"/> shares one thread.
 /// </summary>
 /// <remarks>
 /// <see cref="CallMethod"/> must not be called from inside a work method: the
 /// worker thread would block waiting for a completion signal that only it
 /// can raise.
 /// </remarks>
 public static class CudaManager
 {
     /// <summary>The delegate most recently submitted through <see cref="CallMethod"/>.</summary>
     public static Action WorkMethod { get; private set; }

     // Signalled by CallMethod when a new work method is ready.
     private static readonly AutoResetEvent actionReceived = new AutoResetEvent(false);

     // Signalled by the worker thread when the work method has finished.
     private static readonly AutoResetEvent callbackEvent = new AutoResetEvent(false);

     // Serializes callers so only one work method is in flight at a time.
     private static readonly object mutex = new object();

     private static bool isCudaThreadRunning;

     // Exception thrown by the last work method, handed back to the caller.
     private static Exception workError;

     /// <summary>Worker loop: waits for a work method, runs it, then wakes the caller.</summary>
     private static void ThreadRun()
     {
         // Wait for a work method to execute.
         while (actionReceived.WaitOne())
         {
             try
             {
                 WorkMethod.Invoke();
             }
             catch (Exception ex)
             {
                 // Do not let the failure escape on the worker thread; store it
                 // so CallMethod can rethrow it on the calling thread.
                 workError = ex;
             }
             finally
             {
                 // Always release the caller, even when the work method failed.
                 callbackEvent.Set();
             }
         }
     }

     static CudaManager()
     {
         Run();
     }

     /// <summary>Starts the worker thread if it has not been started yet.</summary>
     public static void Run()
     {
         if (!isCudaThreadRunning)
         {
             Thread thread = new Thread(ThreadRun);
             thread.IsBackground = true;
             thread.Start();
             isCudaThreadRunning = true;
         }
     }

     /// <summary>
     /// Executes <paramref name="method"/> on the worker thread and blocks the
     /// calling thread until it has completed.
     /// </summary>
     /// <param name="method">The work to run on the worker thread.</param>
     /// <exception cref="ArgumentNullException"><paramref name="method"/> is null.</exception>
     /// <exception cref="InvalidOperationException">
     /// The work method threw; the original exception is available as
     /// <see cref="Exception.InnerException"/>.
     /// </exception>
     public static void CallMethod(Action method)
     {
         if (method == null)
         {
             throw new ArgumentNullException("method");
         }

         lock (mutex)
         {
             workError = null;
             WorkMethod = method;

             // Wake the worker thread...
             actionReceived.Set();

             // ...and block the caller until the delegate invocation is complete.
             callbackEvent.WaitOne();

             if (workError != null)
             {
                 Exception failure = workError;
                 workError = null;

                 // Wrap rather than rethrow directly so the worker-side stack
                 // trace is preserved in InnerException.
                 throw new InvalidOperationException("The work method failed on the CUDA thread.", failure);
             }
         }
     }
 }

I hope this helps someone.

+2
source

Check out the CUDAContextSynchronizer class in the GASS documentation.

+1
source

Source: https://habr.com/ru/post/1307985/


All Articles