I am trying to work with FileStream.Seek to quickly jump to a line and read it.
However, I am not getting the right results. I tried to look at this for a while and cannot understand what I'm doing wrong.
Environment:
OS: Windows 7
Framework: .NET 4.0
IDE: Visual C# Express 2010
Sample data in the file C:\Temp\Temp.txt:
0001|100!2500
0002|100!2500
0003|100!2500
0004|100!2500
0005|100!2500
0006|100!2500
0007|100!2500
0008|100!2500
0009|100!2500
0010|100!2500
Code:
class PaddedFileSearch { private int LineLength { get; set; } private string FileName { get; set; } public PaddedFileSearch() { FileName = @"C:\Temp\Temp.txt";
The result that I get:
File Line length: 15
Line No Position Line
------- -------- -----------------
3 30 0003|100!2500
4 15 0004|100!2500
5 45 0005|100!2500
Line No Position Line
------- -------- -----------------
3 30 0003|100!2500
5 30 0004|100!2500
My problem is with the following output:
Line No Position Line
------- -------- -----------------
5 30 0004|100!2500
The output for the line should be: 0005|100!2500
I do not understand why this is happening.
Am I doing something wrong? Is there a workaround? Also, are there any faster ways to do this using something like Seek?
(I'm looking for code-based options and NOT Oracle or SQL Server. For the sake of argument, let's also say that the file size is 1 GB.)
Any help is greatly appreciated.
Thanks.
UPDATE:
I found 4 great answers here. Thank you very much.
Timing sample:
Based on several runs, the following methods are ranked from best to good. Even the "good" one is very close to the best.
The test file contains 10K lines (2.28 MB). I looked up the same 5,000 random lines using all the options.
- Seek4: Elapsed time: 00:00:00.0398530 ms - Ritch Melton
- Seek3: Elapsed time: 00:00:00.0446072 ms - Valentin Kuzub
- Seek1: Elapsed time: 00:00:00.0538210 ms - Jake
- Seek2: Elapsed time: 00:00:00.0889589 ms - bitxwise
Below is the code. After saving it, you can run it simply by calling TestPaddedFileSeek.CallPaddedFileSeek();. You will also need to add a namespace and the required "using" directives.
`
/// <summary>
/// Demonstrates several ways of reading a line by its line number from a padded file
/// (a file in which all lines have the same length). The idea is to jump straight to the
/// line's byte offset instead of scanning the file.
/// Details about the discussion are available at:
/// http://stackoverflow.com/questions/5201414/having-a-problem-while-using-filestream-seek-in-c-solved
/// </summary>
/// <remarks>
/// Every offset is computed as <c>(lineNo - 1) * LineLength</c> and used as a byte position,
/// while <see cref="LineLength"/> is measured in characters plus two for the line terminator.
/// The two only agree when each character occupies one byte and lines end with "\r\n".
/// </remarks>
class PaddedFileSeek
{
    /// <summary>Number of characters added to a line's text to account for "\r\n".</summary>
    private const int NewLineLength = 2;

    /// <summary>The file that is searched.</summary>
    public FileInfo File { get; private set; }

    /// <summary>Length of one line including the "\r\n" terminator; 0 when the file has no lines.</summary>
    public int LineLength { get; private set; }

    /// <summary>
    /// Creates a seeker for <paramref name="fileInfo"/> and measures the line length from its first line.
    /// </summary>
    /// <param name="fileInfo">The padded file to read from.</param>
    /// <exception cref="ArgumentNullException"><paramref name="fileInfo"/> is null.</exception>
    public PaddedFileSeek(FileInfo fileInfo)
    {
        if (fileInfo == null)
        {
            throw new ArgumentNullException("fileInfo");
        }

        // Possibly might have to check that the file exists.
        // A length of 0 (empty file) is accepted: every lookup then simply yields nothing.
        LineLength = FindLineLength(fileInfo);
        File = fileInfo;
    }

    /// <summary>
    /// Runs Seek1 to Seek4 one after another and writes each one's elapsed time to the debug output.
    /// </summary>
    /// <param name="lineNoArray">1-based line numbers for Seek1, Seek2 and Seek3; sorted in place.</param>
    /// <param name="lineNoList">1-based line numbers for Seek4; sorted in place.</param>
    public void CallAll(int[] lineNoArray, List<int> lineNoList)
    {
        Stopwatch sw = new Stopwatch();

        RunTimed(sw, "Seek1: ", () => Seek1(lineNoArray));
        RunTimed(sw, "Seek2: ", () => Seek2(lineNoArray));
        RunTimed(sw, "Seek3: ", () => Seek3(lineNoArray));
        RunTimed(sw, "Seek4: ", () => Seek4(lineNoList));
    }

    /// <summary>
    /// Option by Jake: one shared StreamReader whose buffer is discarded after every line.
    /// </summary>
    /// <param name="lineNoArray">1-based line numbers to read; sorted in place.</param>
    public void Seek1(int[] lineNoArray)
    {
        Array.Sort(lineNoArray);

        using (FileStream fs = new FileStream(File.FullName, FileMode.Open, FileAccess.Read, FileShare.None))
        using (StreamReader reader = new StreamReader(fs))
        {
            foreach (int lineNo in lineNoArray)
            {
                fs.Seek(OffsetOf(lineNo), SeekOrigin.Begin);

                string line = reader.ReadLine();
                if (line != null)
                {
                    PrintLine(lineNo, line);
                }

                // The reader has read ahead past this line; drop that data so the next
                // ReadLine starts at the position the stream is moved to.
                reader.DiscardBufferedData();
            }
        }
    }

    /// <summary>
    /// Option by bitxwise: a fresh StreamReader for every line.
    /// </summary>
    /// <remarks>
    /// A single shared reader gives wrong results here because StreamReader reads ahead into an
    /// internal buffer: after the stream is repositioned it keeps serving lines from that stale
    /// buffer unless DiscardBufferedData is called (which is what Seek1 does). A new reader
    /// starts with an empty buffer. The readers are deliberately not disposed, because disposing
    /// one would also close the underlying stream.
    /// </remarks>
    /// <param name="lineNoArray">1-based line numbers to read; sorted in place.</param>
    public void Seek2(int[] lineNoArray)
    {
        Array.Sort(lineNoArray);

        using (FileStream fs = new FileStream(File.FullName, FileMode.Open, FileAccess.Read, FileShare.None))
        {
            foreach (int lineNo in lineNoArray)
            {
                StreamReader reader = new StreamReader(fs);

                long step = OffsetOf(lineNo) - fs.Position;
                fs.Position += step;

                string line = reader.ReadLine();
                if (line != null)
                {
                    PrintLine(lineNo, line);
                }
            }
        }
    }

    /// <summary>
    /// Option by Valentin Kuzub: seek relative to the current position and read the raw bytes.
    /// </summary>
    /// <remarks>
    /// The relative step is derived from the actual stream position rather than from the previous
    /// line number, so it stays correct even after a read that returned fewer bytes than a full line.
    /// The returned text includes the line terminator.
    /// </remarks>
    /// <param name="lineNoArray">1-based line numbers to read; sorted in place.</param>
    public void Seek3(int[] lineNoArray)
    {
        Array.Sort(lineNoArray);

        using (FileStream fs = new FileStream(File.FullName, FileMode.Open, FileAccess.Read, FileShare.None))
        {
            foreach (int lineNo in lineNoArray)
            {
                long step = OffsetOf(lineNo) - fs.Position;
                fs.Seek(step, SeekOrigin.Current);

                string line = ReadLine(fs, LineLength);
                PrintLine(lineNo, line);
            }
        }
    }

    /// <summary>
    /// Option by Ritch Melton: seek from the beginning of the file and read the raw bytes.
    /// </summary>
    /// <param name="lineNoList">1-based line numbers to read; sorted in place.</param>
    public void Seek4(List<int> lineNoList)
    {
        lineNoList.Sort();

        // Open for reading only: the default access for this constructor is read/write,
        // which fails on read-only files even though nothing is ever written.
        using (FileStream fs = new FileStream(File.FullName, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            foreach (int lineNo in lineNoList)
            {
                OutputData(fs, lineNo);
            }
        }
    }

    #region Private methods

    /// <summary>
    /// Returns the length of the first line of the file plus two for "\r\n", or 0 if the file is empty.
    /// </summary>
    private static int FindLineLength(FileInfo fileInfo)
    {
        using (StreamReader reader = new StreamReader(fileInfo.FullName))
        {
            string firstLine = reader.ReadLine();
            if (firstLine == null)
            {
                return 0;
            }

            return firstLine.Length + NewLineLength;
        }
    }

    /// <summary>
    /// Byte offset of the given 1-based line. Computed in 64-bit arithmetic so that
    /// offsets beyond 2 GB do not overflow.
    /// </summary>
    private long OffsetOf(int lineNo)
    {
        return (lineNo - 1L) * LineLength;
    }

    /// <summary>
    /// Times one seek option: starts the stopwatch, writes the label and header, runs the
    /// action, then writes the elapsed time.
    /// </summary>
    private static void RunTimed(Stopwatch sw, string label, Action seek)
    {
        sw.Reset();
        sw.Start();

        Debug.Write(label);
        PrintHeader();
        seek();

        sw.Stop();
        PrintElapsedTime(sw.Elapsed);
    }

    /// <summary>
    /// Reads up to <paramref name="length"/> bytes from the current position and decodes them as UTF-8.
    /// Used by Seek3 and Seek4.
    /// </summary>
    /// <param name="stream">Stream positioned at the start of the line.</param>
    /// <param name="length">Number of bytes that make up one line.</param>
    /// <returns>The decoded text; shorter than a full line (possibly empty) at the end of the file.</returns>
    private static string ReadLine(FileStream stream, int length)
    {
        byte[] buffer = new byte[length];
        int total = 0;

        // Read may return fewer bytes than requested; keep going until the line is complete
        // or the end of the stream is reached.
        while (total < length)
        {
            int read = stream.Read(buffer, total, length - total);
            if (read == 0)
            {
                break;
            }

            total += read;
        }

        // Decode only what was actually read so that missing bytes do not show up as NUL characters.
        return Encoding.UTF8.GetString(buffer, 0, total);
    }

    /// <summary>Seeks to the given line from the start of the file, reads it and prints it.</summary>
    private void OutputData(FileStream fs, int lineNumber)
    {
        fs.Seek(OffsetOf(lineNumber), SeekOrigin.Begin);
        string text = ReadLine(fs, LineLength);
        PrintLine(lineNumber, text);
    }

    /// <summary>Prints the table header. Output is disabled so that it does not distort the timings.</summary>
    private static void PrintHeader()
    {
        /*
        Debug.Print("");
        Debug.Print("Line No\t\tLine");
        Debug.Print("-------\t\t--------------------------");
        */
    }

    /// <summary>Prints one line. Output is disabled so that it does not distort the timings.</summary>
    private static void PrintLine(int lineNo, string line)
    {
        //Debug.Print("{0}\t\t\t{1}", lineNo, line);
    }

    /// <summary>Writes the elapsed time to the debug output.</summary>
    private static void PrintElapsedTime(TimeSpan elapsed)
    {
        // NOTE: the value is written in TimeSpan format (hh:mm:ss.fffffff), not as a number
        // of milliseconds, despite the "ms" suffix.
        Debug.WriteLine("Time elapsed: {0} ms", elapsed);
    }

    #endregion
}

/// <summary>
/// Test driver: runs all seek options of <see cref="PaddedFileSeek"/> against C:\Temp\Temp.txt.
/// </summary>
static class TestPaddedFileSeek
{
    /// <summary>
    /// Runs 25 rounds; each round looks up the same 5000 random line numbers with every seek option.
    /// </summary>
    public static void CallPaddedFileSeek()
    {
        const int arrayLength = 5000;
        const int rounds = 25;

        int[] lineNoArray = new int[arrayLength];
        List<int> lineNoList = new List<int>(arrayLength);
        Random random = new Random();

        string fileName = @"C:\Temp\Temp.txt";
        PaddedFileSeek seeker = new PaddedFileSeek(new FileInfo(fileName));

        for (int n = 0; n < rounds; n++)
        {
            Debug.Print("Loop no: {0}", n + 1);

            for (int i = 0; i < arrayLength; i++)
            {
                // The upper bound of Random.Next is exclusive, so line numbers are in 1..arrayLength - 1.
                int lineNo = random.Next(1, arrayLength);
                lineNoArray[i] = lineNo;
                lineNoList.Add(lineNo);
            }

            seeker.CallAll(lineNoArray, lineNoList);
            lineNoList.Clear();

            Debug.Print("");
        }
    }
}
`