I built this code to handle string comparisons between a large number of lines in parallel to go faster.
I used ConcurrentBag so that all threads (tasks) could write to a thread safe collection. Then I upload this collection to a file.
The problem is that the ConcurrentBag<string> log , which I dump into the file, fills up faster than it can write to the file. Thus, my program consumes more and more reels until the memory runs out.
My question is what can I do? Improve logging? Suspend tasks until ConcurrentBag is reset, and then resume tasks? What would be the fastest option?
Here is the code:
CsvWriter csv = new CsvWriter(@"C:\test.csv"); List<Bailleur> bailleurs = DataLoader.LoadBailleurs(); ConcurrentBag<string> log = new ConcurrentBag<string>(); int i = 0; var taskWriteToLog = new Task(() => { // Consume the items in the bag string item; while (true) // (!log.IsEmpty) { if (!log.IsEmpty) { if (log.TryTake(out item)) { csv.WriteLine(item); } else Console.WriteLine("Concurrent Bag busy"); } else { System.Threading.Thread.Sleep(1000); } } }); taskWriteToLog.Start(); Parallel.ForEach(bailleurs, s1 => { foreach (Bailleur s2 in bailleurs) { var lcs2 = LongestCommonSubsequenceExtensions.LongestCommonSubsequence(s1.Name, s2.Name); string line = String.Format("\"LCS\",\"{0}\",\"{1}\",\"{2}\"", s1.Name, s2.Name, lcs2.Item2); log.Add(line); // Console.WriteLine(line); var dic = DiceCoefficientExtensions.DiceCoefficient(s1.Name, s2.Name); line = String.Format("\"DICE\",\"{0}\",\"{1}\",\"{2}\"", s1.Name, s2.Name, dic); log.Add(line); // Console.WriteLine(line); } i++; Console.WriteLine(i); }); public class CsvWriter { public string FilePath { get; set; } private FileStream _fs { get; set; } private StreamWriter _sw { get; set; } public CsvWriter2(string filePath) { FilePath = filePath; _fs = new FileStream(FilePath, FileMode.Create, FileAccess.Write); _sw = new StreamWriter(_fs); } public void WriteLine(string line) { _sw.WriteLine(line); } }
source share