I felt that we were lacking certain kinds of substring counting, such as unsafe character-by-character pointer comparisons. I put together the original poster's method and every other method I could think of.
These are the string extensions I made.
namespace Example { using System; using System.Text; public static class StringExtensions { public static int CountSubstr(this string str, string substr) { return (str.Length - str.Replace(substr, "").Length) / substr.Length; } public static int CountSubstr(this string str, char substr) { return (str.Length - str.Replace(substr.ToString(), "").Length); } public static int CountSubstr2(this string str, string substr) { int substrlen = substr.Length; int lastIndex = str.IndexOf(substr, 0, StringComparison.Ordinal); int count = 0; while (lastIndex != -1) { ++count; lastIndex = str.IndexOf(substr, lastIndex + substrlen, StringComparison.Ordinal); } return count; } public static int CountSubstr2(this string str, char substr) { int lastIndex = str.IndexOf(substr, 0); int count = 0; while (lastIndex != -1) { ++count; lastIndex = str.IndexOf(substr, lastIndex + 1); } return count; } public static int CountChar(this string str, char substr) { int length = str.Length; int count = 0; for (int i = 0; i < length; ++i) if (str[i] == substr) ++count; return count; } public static int CountChar2(this string str, char substr) { int count = 0; foreach (var c in str) if (c == substr) ++count; return count; } public static unsafe int CountChar3(this string str, char substr) { int length = str.Length; int count = 0; fixed (char* chars = str) { for (int i = 0; i < length; ++i) if (*(chars + i) == substr) ++count; } return count; } public static unsafe int CountChar4(this string str, char substr) { int length = str.Length; int count = 0; fixed (char* chars = str) { for (int i = length - 1; i >= 0; --i) if (*(chars + i) == substr) ++count; } return count; } public static unsafe int CountSubstr3(this string str, string substr) { int length = str.Length; int substrlen = substr.Length; int count = 0; fixed (char* strc = str) { fixed (char* substrc = substr) { int n = 0; for (int i = 0; i < length; ++i) { if (*(strc + i) == *(substrc + n)) { ++n; if (n == substrlen) { ++count; n = 0; } } else 
n = 0; } } } return count; } public static int CountSubstr3(this string str, char substr) { return CountSubstr3(str, substr.ToString()); } public static unsafe int CountSubstr4(this string str, string substr) { int length = str.Length; int substrLastIndex = substr.Length - 1; int count = 0; fixed (char* strc = str) { fixed (char* substrc = substr) { int n = substrLastIndex; for (int i = length - 1; i >= 0; --i) { if (*(strc + i) == *(substrc + n)) { if (--n == -1) { ++count; n = substrLastIndex; } } else n = substrLastIndex; } } } return count; } public static int CountSubstr4(this string str, char substr) { return CountSubstr4(str, substr.ToString()); } } }
Followed by the test code...
/// <summary>
/// Prints the count returned by every method for each sample input, then times
/// <c>testSize</c> calls of every method and prints the elapsed milliseconds.
/// </summary>
static void Main()
{
    const char matchA = '_';
    const string matchB = "and";
    const string matchC = "muchlongerword";
    const string testStrA = "_and_d_e_banna_i_o___pfasd__and_d_e_banna_i_o___pfasd_";
    const string testStrB = "and sdf and ans andeians andano ip and and sdf and ans andeians andano ip and";
    const string testStrC = "muchlongerword amuchlongerworsdfmuchlongerwordsdf jmuchlongerworijv muchlongerword sdmuchlongerword dsmuchlongerword";
    const int testSize = 1000000;

    // Print each method's result once so the counts can be compared by eye.
    // These calls also run every method once before it is timed.
    Console.WriteLine(testStrA.CountSubstr(matchA));
    Console.WriteLine(testStrA.CountSubstr2(matchA));
    Console.WriteLine(testStrA.CountSubstr3(matchA));
    Console.WriteLine(testStrA.CountSubstr4(matchA));
    Console.WriteLine(testStrA.CountChar(matchA));
    Console.WriteLine(testStrA.CountChar2(matchA));
    Console.WriteLine(testStrA.CountChar3(matchA));
    Console.WriteLine(testStrA.CountChar4(matchA));
    Console.WriteLine(testStrB.CountSubstr(matchB));
    Console.WriteLine(testStrB.CountSubstr2(matchB));
    Console.WriteLine(testStrB.CountSubstr3(matchB));
    Console.WriteLine(testStrB.CountSubstr4(matchB));
    Console.WriteLine(testStrC.CountSubstr(matchC));
    Console.WriteLine(testStrC.CountSubstr2(matchC));
    Console.WriteLine(testStrC.CountSubstr3(matchC));
    Console.WriteLine(testStrC.CountSubstr4(matchC));

    Measure("CS1 chr", testSize, () => testStrA.CountSubstr(matchA));
    Measure("CS1 and", testSize, () => testStrB.CountSubstr(matchB));
    Measure("CS1 mlw", testSize, () => testStrC.CountSubstr(matchC));

    Measure("CS2 chr", testSize, () => testStrA.CountSubstr2(matchA));
    Measure("CS2 and", testSize, () => testStrB.CountSubstr2(matchB));
    Measure("CS2 mlw", testSize, () => testStrC.CountSubstr2(matchC));

    Measure("CS3 chr", testSize, () => testStrA.CountSubstr3(matchA));
    Measure("CS3 and", testSize, () => testStrB.CountSubstr3(matchB));
    Measure("CS3 mlw", testSize, () => testStrC.CountSubstr3(matchC));

    Measure("CS4 chr", testSize, () => testStrA.CountSubstr4(matchA));
    Measure("CS4 and", testSize, () => testStrB.CountSubstr4(matchB));
    Measure("CS4 mlw", testSize, () => testStrC.CountSubstr4(matchC));

    Measure("CC1 chr", testSize, () => testStrA.CountChar(matchA));
    Measure("CC2 chr", testSize, () => testStrA.CountChar2(matchA));
    Measure("CC3 chr", testSize, () => testStrA.CountChar3(matchA));
    Measure("CC4 chr", testSize, () => testStrA.CountChar4(matchA));
}

/// <summary>
/// Invokes <paramref name="count"/> the given number of times and prints the total
/// elapsed time as "<paramref name="label"/>: Nms".
/// </summary>
/// <param name="label">Text printed in front of the timing.</param>
/// <param name="iterations">How many times to invoke <paramref name="count"/>.</param>
/// <param name="count">The counting call being timed; its result is discarded.</param>
static void Measure(string label, int iterations, Func<int> count)
{
    // NOTE: the delegate invocation adds a small per-iteration cost to every
    // measurement, so absolute numbers will be slightly higher than with inlined loops.
    var timer = Stopwatch.StartNew();
    for (int i = 0; i < iterations; ++i)
    {
        count();
    }

    timer.Stop();
    Console.WriteLine(label + ": " + timer.Elapsed.TotalMilliseconds + "ms");
}
Results: CSX corresponds to CountSubstrX and CCX corresponds to CountCharX. "chr" searches a string for '_', "and" searches a string for "and", and "mlw" searches a string for "muchlongerword".
CS1 chr: 824.123ms CS1 and: 586.1893ms CS1 mlw: 486.5414ms CS2 chr: 127.8941ms CS2 and: 806.3918ms CS2 mlw: 497.318ms CS3 chr: 201.8896ms CS3 and: 124.0675ms CS3 mlw: 212.8341ms CS4 chr: 81.5183ms CS4 and: 92.0615ms CS4 mlw: 116.2197ms CC1 chr: 66.4078ms CC2 chr: 64.0161ms CC3 chr: 65.9013ms CC4 chr: 65.8206ms
And finally, I had a file with 3.6 million characters. It was "derp adfderdserp dfaerpderp deasderp" repeated 100,000 times. I searched for "derp" inside the file 100 times with each of the above methods, with these results.
CS1Derp: 1501.3444ms CS2Derp: 1585.797ms CS3Derp: 376.0937ms CS4Derp: 271.1663ms
So my 4th method is definitely the winner, but, realistically, if searching a 3.6 million character file 100 times took only 1586ms in the worst case, then all of this is quite negligible.
By the way, I also scanned the 3.6 million character file for the 'd' char 100 times with each of the CountSubstr and CountChar methods. Results...
CS1 d : 2606.9513ms CS2 d : 339.7942ms CS3 d : 960.281ms CS4 d : 233.3442ms CC1 d : 302.4122ms CC2 d : 280.7719ms CC3 d : 299.1125ms CC4 d : 292.9365ms
According to this, the original poster's method is very bad for single-character needles in a large haystack.
Note: All values were updated to Release build output. I forgot to build in Release mode the first time I posted this. Some of my statements have been amended accordingly.