Saya merasa bahwa kami kekurangan jenis penghitungan sub string tertentu, seperti perbandingan byte-by-byte yang tidak aman. Saya mengumpulkan metode poster asli dan metode apa pun yang dapat saya pikirkan.
Ini adalah ekstensi string yang saya buat.
namespace Example
{
using System;
using System.Text;
public static class StringExtensions
{
public static int CountSubstr(this string str, string substr)
{
return (str.Length - str.Replace(substr, "").Length) / substr.Length;
}
public static int CountSubstr(this string str, char substr)
{
return (str.Length - str.Replace(substr.ToString(), "").Length);
}
public static int CountSubstr2(this string str, string substr)
{
int substrlen = substr.Length;
int lastIndex = str.IndexOf(substr, 0, StringComparison.Ordinal);
int count = 0;
while (lastIndex != -1)
{
++count;
lastIndex = str.IndexOf(substr, lastIndex + substrlen, StringComparison.Ordinal);
}
return count;
}
public static int CountSubstr2(this string str, char substr)
{
int lastIndex = str.IndexOf(substr, 0);
int count = 0;
while (lastIndex != -1)
{
++count;
lastIndex = str.IndexOf(substr, lastIndex + 1);
}
return count;
}
public static int CountChar(this string str, char substr)
{
int length = str.Length;
int count = 0;
for (int i = 0; i < length; ++i)
if (str[i] == substr)
++count;
return count;
}
public static int CountChar2(this string str, char substr)
{
int count = 0;
foreach (var c in str)
if (c == substr)
++count;
return count;
}
public static unsafe int CountChar3(this string str, char substr)
{
int length = str.Length;
int count = 0;
fixed (char* chars = str)
{
for (int i = 0; i < length; ++i)
if (*(chars + i) == substr)
++count;
}
return count;
}
public static unsafe int CountChar4(this string str, char substr)
{
int length = str.Length;
int count = 0;
fixed (char* chars = str)
{
for (int i = length - 1; i >= 0; --i)
if (*(chars + i) == substr)
++count;
}
return count;
}
public static unsafe int CountSubstr3(this string str, string substr)
{
int length = str.Length;
int substrlen = substr.Length;
int count = 0;
fixed (char* strc = str)
{
fixed (char* substrc = substr)
{
int n = 0;
for (int i = 0; i < length; ++i)
{
if (*(strc + i) == *(substrc + n))
{
++n;
if (n == substrlen)
{
++count;
n = 0;
}
}
else
n = 0;
}
}
}
return count;
}
public static int CountSubstr3(this string str, char substr)
{
return CountSubstr3(str, substr.ToString());
}
public static unsafe int CountSubstr4(this string str, string substr)
{
int length = str.Length;
int substrLastIndex = substr.Length - 1;
int count = 0;
fixed (char* strc = str)
{
fixed (char* substrc = substr)
{
int n = substrLastIndex;
for (int i = length - 1; i >= 0; --i)
{
if (*(strc + i) == *(substrc + n))
{
if (--n == -1)
{
++count;
n = substrLastIndex;
}
}
else
n = substrLastIndex;
}
}
}
return count;
}
public static int CountSubstr4(this string str, char substr)
{
return CountSubstr4(str, substr.ToString());
}
}
}
Diikuti oleh kode tes ...
static void Main()
{
const char matchA = '_';
const string matchB = "and";
const string matchC = "muchlongerword";
const string testStrA = "_and_d_e_banna_i_o___pfasd__and_d_e_banna_i_o___pfasd_";
const string testStrB = "and sdf and ans andeians andano ip and and sdf and ans andeians andano ip and";
const string testStrC =
"muchlongerword amuchlongerworsdfmuchlongerwordsdf jmuchlongerworijv muchlongerword sdmuchlongerword dsmuchlongerword";
const int testSize = 1000000;
Console.WriteLine(testStrA.CountSubstr('_'));
Console.WriteLine(testStrA.CountSubstr2('_'));
Console.WriteLine(testStrA.CountSubstr3('_'));
Console.WriteLine(testStrA.CountSubstr4('_'));
Console.WriteLine(testStrA.CountChar('_'));
Console.WriteLine(testStrA.CountChar2('_'));
Console.WriteLine(testStrA.CountChar3('_'));
Console.WriteLine(testStrA.CountChar4('_'));
Console.WriteLine(testStrB.CountSubstr("and"));
Console.WriteLine(testStrB.CountSubstr2("and"));
Console.WriteLine(testStrB.CountSubstr3("and"));
Console.WriteLine(testStrB.CountSubstr4("and"));
Console.WriteLine(testStrC.CountSubstr("muchlongerword"));
Console.WriteLine(testStrC.CountSubstr2("muchlongerword"));
Console.WriteLine(testStrC.CountSubstr3("muchlongerword"));
Console.WriteLine(testStrC.CountSubstr4("muchlongerword"));
var timer = new Stopwatch();
timer.Start();
for (int i = 0; i < testSize; ++i)
testStrA.CountSubstr(matchA);
timer.Stop();
Console.WriteLine("CS1 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrB.CountSubstr(matchB);
timer.Stop();
Console.WriteLine("CS1 and: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrC.CountSubstr(matchC);
timer.Stop();
Console.WriteLine("CS1 mlw: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountSubstr2(matchA);
timer.Stop();
Console.WriteLine("CS2 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrB.CountSubstr2(matchB);
timer.Stop();
Console.WriteLine("CS2 and: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrC.CountSubstr2(matchC);
timer.Stop();
Console.WriteLine("CS2 mlw: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountSubstr3(matchA);
timer.Stop();
Console.WriteLine("CS3 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrB.CountSubstr3(matchB);
timer.Stop();
Console.WriteLine("CS3 and: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrC.CountSubstr3(matchC);
timer.Stop();
Console.WriteLine("CS3 mlw: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountSubstr4(matchA);
timer.Stop();
Console.WriteLine("CS4 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrB.CountSubstr4(matchB);
timer.Stop();
Console.WriteLine("CS4 and: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrC.CountSubstr4(matchC);
timer.Stop();
Console.WriteLine("CS4 mlw: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountChar(matchA);
timer.Stop();
Console.WriteLine("CC1 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountChar2(matchA);
timer.Stop();
Console.WriteLine("CC2 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountChar3(matchA);
timer.Stop();
Console.WriteLine("CC3 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountChar4(matchA);
timer.Stop();
Console.WriteLine("CC4 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
}
Hasil: CSX sesuai dengan CountSubstrX dan CCX sesuai dengan CountCharX. "chr" mencari string untuk '_', "dan" mencari string untuk "dan", dan "mlw" mencari string untuk "muchlongerword"
CS1 chr: 824.123ms
CS1 and: 586.1893ms
CS1 mlw: 486.5414ms
CS2 chr: 127.8941ms
CS2 and: 806.3918ms
CS2 mlw: 497.318ms
CS3 chr: 201.8896ms
CS3 and: 124.0675ms
CS3 mlw: 212.8341ms
CS4 chr: 81.5183ms
CS4 and: 92.0615ms
CS4 mlw: 116.2197ms
CC1 chr: 66.4078ms
CC2 chr: 64.0161ms
CC3 chr: 65.9013ms
CC4 chr: 65.8206ms
Dan akhirnya, saya punya file dengan 3,6 juta karakter. Itu "derp adfderdserp dfaerpderp deasderp" diulang 100.000 kali. Saya mencari "derp" di dalam file dengan metode di atas 100 kali hasil ini.
CS1Derp: 1501.3444ms
CS2Derp: 1585.797ms
CS3Derp: 376.0937ms
CS4Derp: 271.1663ms
Jadi metode ke-4 saya pasti pemenangnya, tetapi, secara realistis, jika file 3,6 juta karakter 100 kali hanya mengambil 1586 ms sebagai kasus terburuk, maka semua ini cukup dapat diabaikan.
Ngomong-ngomong, saya juga memindai 'd' char dalam file 3,6 juta karakter dengan 100 kali metode CountSubstr dan CountChar. Hasil ...
CS1 d : 2606.9513ms
CS2 d : 339.7942ms
CS3 d : 960.281ms
CS4 d : 233.3442ms
CC1 d : 302.4122ms
CC2 d : 280.7719ms
CC3 d : 299.1125ms
CC4 d : 292.9365ms
Metode poster asli sangat buruk untuk jarum karakter tunggal di tumpukan jerami besar menurut ini.
Catatan: Semua nilai diperbarui ke Keluaran versi keluaran. Saya tidak sengaja lupa untuk membangun mode Rilis setelah pertama kali saya memposting ini. Beberapa pernyataan saya telah diubah.