I need to download about 2 million files from the SEC website. Each file has a unique URL and averages 10 KB. This is my current implementation:
// Sequentially downloads every URL through a WebBrowser control and saves
// each document to a file named after the tail of its URL.
List<string> urls = new List<string>();
// ... initialize urls ...

WebBrowser browser = new WebBrowser();
foreach (string url in urls)
{
    browser.Navigate(url);

    // Navigate() is asynchronous: keep pumping the message loop until the
    // control reports that the document has finished loading.
    while (browser.ReadyState != WebBrowserReadyState.Complete)
    {
        Application.DoEvents();
    }

    // The output path is everything from the last '/' of the URL onwards.
    // NOTE(review): the leading '/' is kept, so the path is rooted rather
    // than relative to the working directory - confirm this is intended.
    // The using blocks close both streams even if the copy throws.
    using (StreamReader sr = new StreamReader(browser.DocumentStream))
    using (StreamWriter sw = new StreamWriter(url.Substring(url.LastIndexOf('/'))))
    {
        sw.Write(sr.ReadToEnd());
    }
}
The predicted time is about 12 days... Is there a faster way?
Edit: By the way, processing a local file takes only 7% of the time.
Edit: This is my final implementation:
/// <summary>
/// Downloads every URL in the list using 8 parallel workers and sums the
/// number of retries that were needed across all downloads.
/// </summary>
void Main()
{
    // Raise the cap on concurrent connections so that the parallel workers
    // are not throttled by the default connection limit.
    ServicePointManager.DefaultConnectionLimit = 10000;

    List<string> urls = new List<string>();
    // ... initialize urls ...

    int retries = urls.AsParallel().WithDegreeOfParallelism(8).Sum(arg => downloadFile(arg));
}

/// <summary>
/// Downloads a single URL to a file named after the tail of the URL
/// (everything from the last '/' onwards), retrying until the download
/// succeeds or the server answers 403/404.
/// </summary>
/// <param name="url">Absolute URL of the file to download.</param>
/// <returns>
/// The number of attempts that failed with a transient network error
/// (time-out, connection failure, aborted request, broken transport
/// connection or HTTP 408) before the method returned.
/// </returns>
public int downloadFile(string url)
{
    int retries = 0;

    while (true)
    {
        try
        {
            HttpWebRequest webrequest = (HttpWebRequest)WebRequest.Create(url);
            webrequest.Timeout = 10000;
            webrequest.ReadWriteTimeout = 10000;
            webrequest.Proxy = null;
            webrequest.KeepAlive = false;

            // Dispose the response as well as its stream; an undisposed
            // response keeps its connection allocated.
            // NOTE(review): the output path keeps the leading '/', so it is
            // rooted rather than relative - confirm this is intended.
            using (HttpWebResponse webresponse = (HttpWebResponse)webrequest.GetResponse())
            using (Stream sr = webresponse.GetResponseStream())
            using (FileStream sw = File.Create(url.Substring(url.LastIndexOf('/'))))
            {
                sr.CopyTo(sw);
            }

            return retries;
        }
        catch (Exception ee)
        {
            // Classify by status codes rather than by message text: the
            // message strings are localized and may change between versions.
            HttpStatusCode? httpStatus = GetHttpStatus(ee);

            // Missing or forbidden files will not appear on a retry: give up.
            if (httpStatus == HttpStatusCode.NotFound || httpStatus == HttpStatusCode.Forbidden)
            {
                return retries;
            }

            if (IsTransientFailure(ee, httpStatus))
            {
                retries++;
            }
            else
            {
                // Unexpected failure: surface it to the user, then try again
                // (not counted as a retry).
                MessageBox.Show(ee.Message, "Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }
        }
    }
}

// Returns the HTTP status code carried by a protocol-level WebException, or
// null when the exception does not carry an HTTP response.
private static HttpStatusCode? GetHttpStatus(Exception error)
{
    WebException webError = error as WebException;
    if (webError == null || webError.Status != WebExceptionStatus.ProtocolError)
    {
        return null;
    }

    // The error response owns a connection too, so release it once the
    // status code has been read.
    using (HttpWebResponse response = webError.Response as HttpWebResponse)
    {
        return response == null ? (HttpStatusCode?)null : response.StatusCode;
    }
}

// Tells whether a failed attempt is a transient network problem that should
// be counted as a retry.
private static bool IsTransientFailure(Exception error, HttpStatusCode? httpStatus)
{
    WebException webError = error as WebException;
    if (webError != null)
    {
        switch (webError.Status)
        {
            case WebExceptionStatus.Timeout:
            case WebExceptionStatus.ConnectFailure:
            case WebExceptionStatus.RequestCanceled:
                return true;
            case WebExceptionStatus.ProtocolError:
                return httpStatus == HttpStatusCode.RequestTimeout;
            default:
                return false;
        }
    }

    // A transport connection that breaks while the body is being read
    // surfaces as an IOException wrapping a SocketException; a plain
    // IOException (e.g. from writing the file) is not a network problem.
    return error is IOException && error.InnerException is System.Net.Sockets.SocketException;
}
source share