diff --git a/.gitignore b/.gitignore index 1e4918c..b5b60ae 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,5 @@ go.work.sum # default build target bloomtool -test/ \ No newline at end of file +test/ +*.bmp diff --git a/README.md b/README.md index 9055a8c..75a5acc 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,7 @@ BloomTool 是一个基于 Go 语言开发的布隆过滤器命令行工具,用 - **低误判率**: 支持自定义误判率,默认千万分之一 - **大容量**: 支持最多 100 亿个元素 - **简单易用**: 命令行界面,操作简单 +- **位图运算**: 支持AND/OR位图运算,实现集合操作 ## 安装 @@ -28,7 +29,7 @@ bloomtool makebloom -d <设备ID文件> -b <位图输出文件> [-e <最大元 **参数:** - `-d`: 设备ID文本文件路径(每行一个ID) -- `-b`: 布隆过滤器位图输出文件路径 +- `-o`: 布隆过滤器位图输出文件路径 - `-e`: 最大元素数量(0表示自动计算,最大100亿) - `-r`: 误判率(0.01-0.000000001,默认0.00000001) @@ -80,7 +81,59 @@ bloomtool info -b <位图文件> bloomtool info -b bloom.blm ``` -### 4. help - 帮助信息 +### 4. and - 位图与运算(求交) + +对两个布隆过滤器位图文件执行AND(与)运算,生成新的位图文件。 + +**注意:对两个文件进行与运算,并不等同于元素求交。请查阅bloomfilter的作用原理。** + +**语法:** +```bash +bloomtool and -b1 <位图文件1> -b2 <位图文件2> -o <输出文件> +``` + +**参数:** +- `-b1`: 第一个布隆过滤器位图文件路径 +- `-b2`: 第二个布隆过滤器位图文件路径 +- `-o`: AND运算结果输出文件路径 + +**说明:** +- AND运算结果包含同时存在于两个布隆过滤器中的元素 +- 要求两个位图文件的参数(元素数量上限和误判率)必须完全一致 +- 运算结果仍然是一个布隆过滤器,可以继续用于其他操作 + +**示例:** +```bash +# 对两个布隆过滤器执行AND运算 +bloomtool and -b1 bloom1.blm -b2 bloom2.blm -o result.blm +``` + +### 5. or - 位图或运算(求并) + +对两个布隆过滤器位图文件执行OR(或)运算,生成新的位图文件。 + +**语法:** +```bash +bloomtool or -b1 <位图文件1> -b2 <位图文件2> -o <输出文件> +``` + +**参数:** +- `-b1`: 第一个布隆过滤器位图文件路径 +- `-b2`: 第二个布隆过滤器位图文件路径 +- `-o`: OR运算结果输出文件路径 + +**说明:** +- OR运算结果包含存在于任一布隆过滤器中的元素 +- 要求两个位图文件的参数(元素数量上限和误判率)必须完全一致 +- 运算结果仍然是一个布隆过滤器,可以继续用于其他操作 + +**示例:** +```bash +# 对两个布隆过滤器执行OR运算 +bloomtool or -b1 bloom1.blm -b2 bloom2.blm -o result.blm +``` + +### 6. help - 帮助信息 显示命令使用帮助。 @@ -112,6 +165,23 @@ bloomtool hittest -d test_devices.txt -b device_bloom.blm -s test_results.txt bloomtool info -b device_bloom.blm ``` +4. 
**位图运算示例** +```bash +# 创建两个不同的布隆过滤器 +bloomtool makebloom -d devices1.txt -o bloom1.blm +bloomtool makebloom -d devices2.txt -o bloom2.blm + +# AND运算:获取同时存在于两个集合中的元素 +bloomtool and -b1 bloom1.blm -b2 bloom2.blm -o intersection.blm + +# OR运算:获取存在于任一集合中的元素 +bloomtool or -b1 bloom1.blm -b2 bloom2.blm -o union.blm + +# 测试运算结果 +bloomtool hittest -d test_ids.txt -b intersection.blm -o intersection_results.txt +bloomtool hittest -d test_ids.txt -b union.blm -o union_results.txt +``` + ## 文件格式说明 ### 输入文件格式 @@ -128,6 +198,7 @@ bloomtool info -b device_bloom.blm - **查询速度**: 常数时间复杂度的查询操作 - **误判率可控**: 可根据需求调整误判率 - **大容量支持**: 支持海量数据处理 +- **集合运算**: 支持高效的位图AND/OR运算 ## 注意事项 @@ -135,6 +206,7 @@ bloomtool info -b device_bloom.blm 2. 不支持从布隆过滤器中删除元素 3. 位图文件大小与元素数量和误判率相关 4. 建议对输入数据进行哈希处理以提高性能 +5. AND/OR运算要求两个位图文件的参数必须完全一致 ## 依赖项 diff --git a/and.go b/and.go new file mode 100644 index 0000000..e038fd0 --- /dev/null +++ b/and.go @@ -0,0 +1,81 @@ +package main + +import ( +"flag" +"log/slog" + +"git.algo.com.cn/public/bloomtool/internal/bloom" +) + +func RunAnd(args ...string) error { + fs := flag.NewFlagSet("and", flag.ExitOnError) + + bmpFile1 := fs.String("b1", "", "first bitmap filename") + bmpFile2 := fs.String("b2", "", "second bitmap filename") + outFile := fs.String("o", "", "output bitmap filename") + + if err := fs.Parse(args); err != nil { + return err + } else if fs.NArg() > 0 || *bmpFile1 == "" || *bmpFile2 == "" || *outFile == "" { + fs.Usage() + return nil + } + + return andOperation(*bmpFile1, *bmpFile2, *outFile) +} + +func andOperation(bmpFile1, bmpFile2, outFile string) error { + // 加载第一个bitmap文件 + slog.Info("load first bitmap file", "filename", bmpFile1) + bf1, err := bloom.LoadFromFile(bmpFile1, false) + if err != nil { + slog.Error("load first bitmap file error", "err", err) + return err + } + + // 加载第二个bitmap文件 + slog.Info("load second bitmap file", "filename", bmpFile2) + bf2, err := bloom.LoadFromFile(bmpFile2, false) + if err != nil { + slog.Error("load second 
bitmap file error", "err", err) + return err + } + + // 检查参数一致性 + stat1 := bf1.GetStat() + stat2 := bf2.GetStat() + + slog.Info("first bitmap info", "elements", stat1.ElementsMax, "falseRate", stat1.FalsePositiveRate) + slog.Info("second bitmap info", "elements", stat2.ElementsMax, "falseRate", stat2.FalsePositiveRate) + + if stat1.ElementsMax != stat2.ElementsMax { + slog.Error("elements count mismatch", "file1", stat1.ElementsMax, "file2", stat2.ElementsMax) + return flag.ErrHelp + } + + if stat1.FalsePositiveRate != stat2.FalsePositiveRate { + slog.Error("false positive rate mismatch", "file1", stat1.FalsePositiveRate, "file2", stat2.FalsePositiveRate) + return flag.ErrHelp + } + + // 执行AND运算 + slog.Info("perform AND operation") + result, err := bf1.And(bf2) + if err != nil { + slog.Error("AND operation error", "err", err) + return err + } + + // 保存结果 + slog.Info("save result bitmap file", "filename", outFile) + err = result.SaveToFile(outFile) + if err != nil { + slog.Error("save result bitmap file error", "err", err) + return err + } + + resultStat := result.GetStat() + slog.Info("AND operation completed", "resultElements", resultStat.ElementsAdded) + + return nil +} diff --git a/bitmap_test.go b/bitmap_test.go new file mode 100644 index 0000000..90cfd36 --- /dev/null +++ b/bitmap_test.go @@ -0,0 +1,645 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "testing" + + "git.algo.com.cn/public/bloomtool/internal/bloom" +) + +// TestBloomFilterBasic 测试布隆过滤器基本功能 +func TestBloomFilterBasic(t *testing.T) { + // 创建布隆过滤器 + bf := bloom.NewWithEstimates(1000, 0.01) + + // 测试添加元素 + testData := []string{"apple", "banana", "cherry", "date"} + for _, data := range testData { + bf.AddString(data) + } + + // 刷新缓冲区确保所有元素都被处理 + bf.Flush() + + // 测试存在的元素 + for _, data := range testData { + if !bf.TestString(data) { + t.Errorf("Expected element %s to be present", data) + } + } + + // 测试不存在的元素(可能有假阳性,但不应该总是假阳性) + nonExistentData := []string{"elderberry", 
"fig", "grape"} + falsePositives := 0 + for _, data := range nonExistentData { + if bf.TestString(data) { + falsePositives++ + } + } + + // 假阳性率应该合理 + falsePositiveRate := float64(falsePositives) / float64(len(nonExistentData)) + if falsePositiveRate > 0.1 { // 允许一定的假阳性率 + t.Errorf("False positive rate too high: %f", falsePositiveRate) + } +} + +// TestBloomFilterStatistics 测试布隆过滤器统计信息 +func TestBloomFilterStatistics(t *testing.T) { + bf := bloom.NewWithEstimates(1000, 0.01) + + // 初始状态 + stat := bf.GetStat() + if stat.ElementsAdded != 0 { + t.Errorf("Expected 0 elements added, got %d", stat.ElementsAdded) + } + if stat.ElementsMax != 1000 { + t.Errorf("Expected 1000 max elements, got %d", stat.ElementsMax) + } + + // 添加元素 + testData := []string{"apple", "banana", "cherry"} + for _, data := range testData { + bf.AddString(data) + } + + // 刷新缓冲区 + bf.Flush() + + // 添加后的状态 + stat = bf.GetStat() + if stat.ElementsAdded != uint64(len(testData)) { + t.Errorf("Expected %d elements added, got %d", len(testData), stat.ElementsAdded) + } +} + +// TestBloomFilterSaveAndLoad 测试文件保存和加载 +func TestBloomFilterSaveAndLoad(t *testing.T) { + // 创建临时目录 + tempDir := t.TempDir() + testFile := filepath.Join(tempDir, "test.bmp") + + // 创建布隆过滤器并添加数据 + originalBf := bloom.NewWithEstimates(1000, 0.01) + testData := []string{"apple", "banana", "cherry", "date", "elderberry"} + for _, data := range testData { + originalBf.AddString(data) + } + + // 确保所有元素都被处理 + originalBf.Flush() + + // 保存到文件 + err := originalBf.SaveToFile(testFile) + if err != nil { + t.Fatalf("Failed to save bloom filter: %v", err) + } + + // 从文件加载 + loadedBf, err := bloom.LoadFromFile(testFile, false) + if err != nil { + t.Fatalf("Failed to load bloom filter: %v", err) + } + + // 验证加载的数据 + originalStat := originalBf.GetStat() + loadedStat := loadedBf.GetStat() + + if originalStat.ElementsAdded != loadedStat.ElementsAdded { + t.Errorf("Elements count mismatch: original=%d, loaded=%d", + originalStat.ElementsAdded, 
loadedStat.ElementsAdded) + } + if originalStat.ElementsMax != loadedStat.ElementsMax { + t.Errorf("Max elements mismatch: original=%d, loaded=%d", + originalStat.ElementsMax, loadedStat.ElementsMax) + } + + // 验证数据一致性 + for _, data := range testData { + if !loadedBf.TestString(data) { + t.Errorf("Loaded bloom filter missing element: %s", data) + } + } +} + +// TestBloomFilterAndOperation 测试AND操作 +func TestBloomFilterAndOperation(t *testing.T) { + // 创建两个布隆过滤器 + bf1 := bloom.NewWithEstimates(1000, 0.01) + bf2 := bloom.NewWithEstimates(1000, 0.01) + + // 添加数据 + commonData := []string{"apple", "banana"} // 共同元素 + onlyInBf1 := []string{"cherry", "date"} // 只在bf1中 + onlyInBf2 := []string{"elderberry", "fig"} // 只在bf2中 + + for _, data := range commonData { + bf1.AddString(data) + bf2.AddString(data) + } + + for _, data := range onlyInBf1 { + bf1.AddString(data) + } + + for _, data := range onlyInBf2 { + bf2.AddString(data) + } + + // 刷新缓冲区 + bf1.Flush() + bf2.Flush() + + // 执行AND操作 + result, err := bf1.And(bf2) + if err != nil { + t.Fatalf("AND operation failed: %v", err) + } + + // 验证结果:共同元素应该存在 + missingCommon := false + for _, data := range commonData { + if !result.TestString(data) { + missingCommon = true + t.Logf("Common element %s missing from AND result", data) + } + } + + // 由于布隆过滤器的假阳性特性,我们检查是否有共同元素存在 + if missingCommon { + t.Error("Not all common elements found in AND result") + } + + // 验证AND操作确实减少了某些元素 + bf1Test := bf1.TestString("cherry") + resultTest := result.TestString("cherry") + + if bf1Test && !resultTest { + t.Log("AND operation correctly removed element present in only one filter") + } +} + +// TestBloomFilterOrOperation 测试OR操作 +func TestBloomFilterOrOperation(t *testing.T) { + // 创建两个布隆过滤器 + bf1 := bloom.NewWithEstimates(1000, 0.01) + bf2 := bloom.NewWithEstimates(1000, 0.01) + + // 添加数据 + commonData := []string{"apple", "banana"} // 共同元素 + onlyInBf1 := []string{"cherry", "date"} // 只在bf1中 + onlyInBf2 := []string{"elderberry", "fig"} // 只在bf2中 + + 
allData := append(append(commonData, onlyInBf1...), onlyInBf2...) + + for _, data := range commonData { + bf1.AddString(data) + bf2.AddString(data) + } + + for _, data := range onlyInBf1 { + bf1.AddString(data) + } + + for _, data := range onlyInBf2 { + bf2.AddString(data) + } + + // Flush buffered additions before combining the filters + bf1.Flush() + bf2.Flush() + + // Perform the OR operation + result, err := bf1.Or(bf2) + if err != nil { + t.Fatalf("OR operation failed: %v", err) + } + + // Every element added to either filter must be reported by the union + missingElements := 0 + for _, data := range allData { + if !result.TestString(data) { + missingElements++ + t.Logf("Element %s missing from OR result", data) + } + } + + // A Bloom filter has no false negatives, so not a single inserted element may be missing + if missingElements > 0 { + t.Errorf("Too many elements (%d/%d) missing from OR result", missingElements, len(allData)) + } + + // Informational only: an element of the first filter is still reported by the union + resultTest := result.TestString("cherry") + bf1Test := bf1.TestString("cherry") + + if bf1Test && resultTest { + t.Log("OR operation correctly preserved element from first filter") + } +} + +// TestBloomFilterOperationErrors 测试操作错误处理 +func TestBloomFilterOperationErrors(t *testing.T) { + // 创建两个不兼容的布隆过滤器(不同的误判率) + bf1 := bloom.NewWithEstimates(1000, 0.01) + bf2 := bloom.NewWithEstimates(1000, 0.02) // 不同的误判率 + + // 测试AND操作错误 + _, err := bf1.And(bf2) + if err == nil { + t.Error("Expected AND operation to fail with different false positive rates") + } + + // 测试OR操作错误 + _, err = bf1.Or(bf2) + if err == nil { + t.Error("Expected OR operation to fail with different false positive rates") + } + + // 创建不同参数数量的过滤器(这会导致不同的k值) + bf3 := bloom.NewWithEstimates(2000, 0.01) // 不同的最大元素数 + + // 测试AND操作错误 + _, err = bf1.And(bf3) + if err == nil { + t.Log("Note: Different element counts resulted in same k value, AND operation succeeded") + } else { + t.Log("AND operation correctly failed with different parameters") + } + + // 测试OR操作错误 + _, err = bf1.Or(bf3) + if err == nil { + t.Log("Note: Different element counts resulted in same k value, OR operation succeeded") + } else { + t.Log("OR 
operation correctly failed with different parameters") + } +} + +// TestRunMakeBloom 测试makebloom命令 +func TestRunMakeBloom(t *testing.T) { + // 创建临时目录 + tempDir := t.TempDir() + + // 创建测试数据文件 + dataFile := filepath.Join(tempDir, "test_data.txt") + outputFile := filepath.Join(tempDir, "test_output.bmp") + + testData := "apple\nbanana\ncherry\ndate\nelderberry\n" + err := os.WriteFile(dataFile, []byte(testData), 0644) + if err != nil { + t.Fatalf("Failed to create test data file: %v", err) + } + + // 运行makebloom命令 + err = RunMakeBloom("-d", dataFile, "-o", outputFile, "-e", "1000", "-r", "0.01") + if err != nil { + t.Fatalf("RunMakeBloom failed: %v", err) + } + + // 验证输出文件存在 + if _, err := os.Stat(outputFile); os.IsNotExist(err) { + t.Error("Output file was not created") + } + + // 加载并验证bitmap + bf, err := bloom.LoadFromFile(outputFile, false) + if err != nil { + t.Fatalf("Failed to load created bitmap: %v", err) + } + + // 验证数据 + lines := strings.Split(strings.TrimSpace(testData), "\n") + for _, line := range lines { + if !bf.TestString(line) { + t.Errorf("Element %s not found in created bitmap", line) + } + } +} + +// TestRunHitTest 测试hittest命令 +func TestRunHitTest(t *testing.T) { + // 创建临时目录 + tempDir := t.TempDir() + + // 创建测试bitmap + bitmapFile := filepath.Join(tempDir, "test.bmp") + dataFile := filepath.Join(tempDir, "test_data.txt") + resultFile := filepath.Join(tempDir, "test_result.txt") + + // 创建原始数据 + originalData := []string{"apple", "banana", "cherry", "date"} + + // 创建bitmap + bf := bloom.NewWithEstimates(1000, 0.01) + for _, data := range originalData { + bf.AddString(data) + } + bf.Flush() + + err := bf.SaveToFile(bitmapFile) + if err != nil { + t.Fatalf("Failed to save bitmap: %v", err) + } + + // 创建测试数据文件 + testData := "apple\nbanana\nelderberry\nfig\n" + err = os.WriteFile(dataFile, []byte(testData), 0644) + if err != nil { + t.Fatalf("Failed to create test data file: %v", err) + } + + // 运行hittest命令 + err = RunHitTest("-d", dataFile, "-b", 
bitmapFile, "-o", resultFile) + if err != nil { + t.Fatalf("RunHitTest failed: %v", err) + } + + // 验证结果文件 + resultContent, err := os.ReadFile(resultFile) + if err != nil { + t.Fatalf("Failed to read result file: %v", err) + } + + lines := strings.Split(strings.TrimSpace(string(resultContent)), "\n") + + // 应该找到apple和banana,找不到elderberry和fig + expectedHits := 2 + actualHits := 0 + for _, line := range lines { + if strings.Contains(line, "apple") || strings.Contains(line, "banana") { + actualHits++ + } + } + + if actualHits != expectedHits { + t.Errorf("Expected %d hits, got %d", expectedHits, actualHits) + } +} + +// TestRunInfo 测试info命令 +func TestRunInfo(t *testing.T) { + // 创建临时目录 + tempDir := t.TempDir() + bitmapFile := filepath.Join(tempDir, "test.bmp") + + // 创建测试bitmap + bf := bloom.NewWithEstimates(1000, 0.01) + bf.AddString("apple") + bf.AddString("banana") + + err := bf.SaveToFile(bitmapFile) + if err != nil { + t.Fatalf("Failed to save bitmap: %v", err) + } + + // 运行info命令(这里只是测试不报错,实际输出需要手动验证) + err = RunInfo("-b", bitmapFile) + if err != nil { + t.Errorf("RunInfo failed: %v", err) + } +} + +// TestRunAnd 测试and命令 +func TestRunAnd(t *testing.T) { + // 创建临时目录 + tempDir := t.TempDir() + + // 创建两个bitmap文件 + bitmapFile1 := filepath.Join(tempDir, "test1.bmp") + bitmapFile2 := filepath.Join(tempDir, "test2.bmp") + outputFile := filepath.Join(tempDir, "and_result.bmp") + + // 创建第一个bitmap + bf1 := bloom.NewWithEstimates(1000, 0.01) + bf1.AddString("apple") + bf1.AddString("banana") + bf1.AddString("cherry") + bf1.Flush() + + err := bf1.SaveToFile(bitmapFile1) + if err != nil { + t.Fatalf("Failed to save bitmap1: %v", err) + } + + // 创建第二个bitmap + bf2 := bloom.NewWithEstimates(1000, 0.01) + bf2.AddString("banana") + bf2.AddString("cherry") + bf2.AddString("date") + bf2.Flush() + + err = bf2.SaveToFile(bitmapFile2) + if err != nil { + t.Fatalf("Failed to save bitmap2: %v", err) + } + + // 运行and命令 + err = RunAnd("-b1", bitmapFile1, "-b2", bitmapFile2, "-o", 
outputFile) + if err != nil { + t.Fatalf("RunAnd failed: %v", err) + } + + // 验证结果 + resultBf, err := bloom.LoadFromFile(outputFile, false) + if err != nil { + t.Fatalf("Failed to load result bitmap: %v", err) + } + + // banana和cherry应该存在 + if !resultBf.TestString("banana") || !resultBf.TestString("cherry") { + t.Error("Expected common elements missing from AND result") + } +} + +// TestRunOr 测试or命令 +func TestRunOr(t *testing.T) { + // 创建临时目录 + tempDir := t.TempDir() + + // 创建两个bitmap文件 + bitmapFile1 := filepath.Join(tempDir, "test1.bmp") + bitmapFile2 := filepath.Join(tempDir, "test2.bmp") + outputFile := filepath.Join(tempDir, "or_result.bmp") + + // 创建第一个bitmap + bf1 := bloom.NewWithEstimates(1000, 0.01) + bf1.AddString("apple") + bf1.AddString("banana") + bf1.Flush() + + err := bf1.SaveToFile(bitmapFile1) + if err != nil { + t.Fatalf("Failed to save bitmap1: %v", err) + } + + // 创建第二个bitmap + bf2 := bloom.NewWithEstimates(1000, 0.01) + bf2.AddString("cherry") + bf2.AddString("date") + bf2.Flush() + + err = bf2.SaveToFile(bitmapFile2) + if err != nil { + t.Fatalf("Failed to save bitmap2: %v", err) + } + + // 运行or命令 + err = RunOr("-b1", bitmapFile1, "-b2", bitmapFile2, "-o", outputFile) + if err != nil { + t.Fatalf("RunOr failed: %v", err) + } + + // 验证结果 + resultBf, err := bloom.LoadFromFile(outputFile, false) + if err != nil { + t.Fatalf("Failed to load result bitmap: %v", err) + } + + // 所有元素应该存在 + testElements := []string{"apple", "banana", "cherry", "date"} + for _, element := range testElements { + if !resultBf.TestString(element) { + t.Errorf("Element %s missing from OR result", element) + } + } +} + +// TestBloomFilterEdgeCases 测试边界情况 +func TestBloomFilterEdgeCases(t *testing.T) { + // 测试空字符串 + bf := bloom.NewWithEstimates(100, 0.01) + bf.AddString("") + bf.Flush() + + if !bf.TestString("") { + t.Error("Empty string should be testable") + } + + // 测试长字符串 + longString := strings.Repeat("a", 10000) + bf.AddString(longString) + bf.Flush() + + if 
!bf.TestString(longString) { + t.Error("Long string should be testable") + } + + // 测试特殊字符 + specialString := "测试中文字符符!@#$%^&*()" + bf.AddString(specialString) + bf.Flush() + + if !bf.TestString(specialString) { + t.Error("Special characters should be testable") + } +} + +// TestBloomFilterConcurrency 测试并发安全性 +func TestBloomFilterConcurrency(t *testing.T) { + bf := bloom.NewWithEstimates(10000, 0.01) + + // 使用多个goroutine并发添加元素 + done := make(chan bool, 10) + for i := 0; i < 10; i++ { + go func(id int) { + for j := 0; j < 100; j++ { + data := fmt.Sprintf("item_%d_%d", id, j) + bf.AddString(data) + } + done <- true + }(i) + } + + // 等待所有goroutine完成 + for i := 0; i < 10; i++ { + <-done + } + + // 刷新所有缓冲区 + bf.Flush() + + // 验证所有元素都存在 + for i := 0; i < 10; i++ { + for j := 0; j < 100; j++ { + data := fmt.Sprintf("item_%d_%d", i, j) + if !bf.TestString(data) { + t.Errorf("Concurrent element %s not found", data) + } + } + } +} + +// BenchmarkBloomFilterAdd 性能测试:添加元素 +func BenchmarkBloomFilterAdd(b *testing.B) { + bf := bloom.NewWithEstimates(1000000, 0.01) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + bf.AddString(fmt.Sprintf("item_%d", i)) + } + bf.Flush() +} + +// BenchmarkBloomFilterTest 性能测试:测试元素 +func BenchmarkBloomFilterTest(b *testing.B) { + bf := bloom.NewWithEstimates(100000, 0.01) + + // 预先添加一些元素 + for i := 0; i < 1000; i++ { + bf.AddString(fmt.Sprintf("item_%d", i)) + } + bf.Flush() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + bf.TestString(fmt.Sprintf("item_%d", i%1000)) + } +} + +// BenchmarkBloomFilterAndOperation 性能测试:AND操作 +func BenchmarkBloomFilterAndOperation(b *testing.B) { + bf1 := bloom.NewWithEstimates(10000, 0.01) + bf2 := bloom.NewWithEstimates(10000, 0.01) + + // 预先添加元素 + for i := 0; i < 1000; i++ { + bf1.AddString(fmt.Sprintf("item_%d", i)) + bf2.AddString(fmt.Sprintf("item_%d", i)) + } + bf1.Flush() + bf2.Flush() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := bf1.And(bf2) + if err != nil { + b.Fatal(err) + } + } +} + +// 
BenchmarkBloomFilterOrOperation 性能测试:OR操作 +func BenchmarkBloomFilterOrOperation(b *testing.B) { + bf1 := bloom.NewWithEstimates(10000, 0.01) + bf2 := bloom.NewWithEstimates(10000, 0.01) + + // 预先添加元素 + for i := 0; i < 1000; i++ { + bf1.AddString(fmt.Sprintf("item_%d", i)) + bf2.AddString(fmt.Sprintf("item_%d", i+1000)) + } + bf1.Flush() + bf2.Flush() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := bf1.Or(bf2) + if err != nil { + b.Fatal(err) + } + } +} diff --git a/help.go b/help.go index 0cb8b7c..429a0dc 100644 --- a/help.go +++ b/help.go @@ -19,6 +19,8 @@ The commands are: makebloom Make bloom filter bitmap file hittest Hittest text lines in bitmap info Show bitmap file info + and AND operation between two bitmap files + or OR operation between two bitmap files "help" is the default command. diff --git a/hittest.go b/hittest.go index 8457119..ff56666 100644 --- a/hittest.go +++ b/hittest.go @@ -15,18 +15,18 @@ func RunHitTest(args ...string) error { txtFile := fs.String("d", "", "device id filename") bmpFile := fs.String("b", "", "bitmap filename") - stateFile := fs.String("s", "", "state filename for output") + outStateFile := fs.String("o", "", "state filename for output") filter := fs.Bool("f", false, "filter for hit only") if err := fs.Parse(args); err != nil { return err - } else if fs.NArg() > 0 || *txtFile == "" || *bmpFile == "" || *stateFile == "" { + } else if fs.NArg() > 0 || *txtFile == "" || *bmpFile == "" || *outStateFile == "" { fmt.Println(fs.NArg()) fs.Usage() return nil } - return hitTest(*txtFile, *bmpFile, *stateFile, *filter) + return hitTest(*txtFile, *bmpFile, *outStateFile, *filter) } func hitTest(txtFile, bmpFile, stateFile string, filter bool) error { diff --git a/internal/bloom/bloom.go b/internal/bloom/bloom.go index 180d7c6..85edfff 100644 --- a/internal/bloom/bloom.go +++ b/internal/bloom/bloom.go @@ -343,3 +343,55 @@ func LoadFromFile(filename string, headerOnly bool) (bft *BloomFilter, err error return bft, nil } + +// 
And returns a new filter whose bitmap is the bitwise AND of b and other; both filters must share m, k and falsePositiveRate. +func (b *BloomFilter) And(other *BloomFilter) (*BloomFilter, error) { + // Bit positions are only comparable when the bitmap size (m), hash count (k) and rate all match + if b.m != other.m || b.k != other.k || b.falsePositiveRate != other.falsePositiveRate { + return nil, errors.New("bloom filters must have same m, k and falsePositiveRate for AND operation") + } + + // Build the result with the receiver's parameters + result := newBloomFilter(b.m, b.k, b.elementsMax, b.falsePositiveRate) + + // Intersect the two bitmaps + result.rb = roaring64.And(b.rb, other.rb) + + // The element count of the intersection cannot be recovered from the bits, + // so the smaller of the two input counts is stored as an estimate + if b.elementsAdded < other.elementsAdded { + result.elementsAdded = b.elementsAdded + } else { + result.elementsAdded = other.elementsAdded + } + + return result, nil +} + +// Or returns a new filter whose bitmap is the bitwise OR of b and other; both filters must share m, k and falsePositiveRate. +func (b *BloomFilter) Or(other *BloomFilter) (*BloomFilter, error) { + // Bit positions are only comparable when the bitmap size (m), hash count (k) and rate all match + if b.m != other.m || b.k != other.k || b.falsePositiveRate != other.falsePositiveRate { + return nil, errors.New("bloom filters must have same m, k and falsePositiveRate for OR operation") + } + + // Build the result with the receiver's parameters + result := newBloomFilter(b.m, b.k, b.elementsMax, b.falsePositiveRate) + + // Union the two bitmaps + result.rb = roaring64.Or(b.rb, other.rb) + + // The element count of the union is an estimate: + // the larger of the two input counts, capped below at elementsMax + if b.elementsAdded > other.elementsAdded { + result.elementsAdded = b.elementsAdded + } else { + result.elementsAdded = other.elementsAdded + } + + if result.elementsAdded > result.elementsMax { + result.elementsAdded = result.elementsMax + } + + return result, nil +} diff --git a/main.go b/main.go index 7b68611..038b9e3 100644 --- a/main.go +++ b/main.go @@ -25,6 +25,10 @@ func Run(args ...string) error { return RunHitTest(args...) case "info": return RunInfo(args...) + case "and": + return RunAnd(args...) + case "or": + return RunOr(args...) 
default: err := fmt.Errorf(`unknown command "%s"`+"\n"+`Run 'bloomtool help' for usage`, name) slog.Warn(err.Error()) diff --git a/makebloom.go b/makebloom.go index 8fb4243..293273b 100644 --- a/makebloom.go +++ b/makebloom.go @@ -17,23 +17,23 @@ func RunMakeBloom(args ...string) error { fs := flag.NewFlagSet("makebloom", flag.ExitOnError) txtFile := fs.String("d", "", "device id filename") - bmpFile := fs.String("b", "", "bitmap filename for output") + outFile := fs.String("o", "", "output bitmap filename") elements := fs.Uint64("e", 0, "max elements. (max 100 0000 0000). if 0 then auto") falseRate := fs.Float64("r", FalseRate, "false rate (0.01--0.0000 0000 1)") if err := fs.Parse(args); err != nil { return err - } else if fs.NArg() > 0 || *txtFile == "" || *bmpFile == "" || + } else if fs.NArg() > 0 || *txtFile == "" || *outFile == "" || *elements > 10000000000 || *falseRate > 0.01 || *falseRate < 0.000000001 { fs.Usage() return nil } - return makeBloom(*txtFile, *bmpFile, *elements, *falseRate) + return makeBloom(*txtFile, *outFile, *elements, *falseRate) } -func makeBloom(txtFile string, bmpFile string, elements uint64, falseRate float64) error { +func makeBloom(txtFile string, outFile string, elements uint64, falseRate float64) error { // 打开设备号文件 slog.Info("open source file", "filename", txtFile) tfile, err := os.Open(txtFile) @@ -76,8 +76,8 @@ func makeBloom(txtFile string, bmpFile string, elements uint64, falseRate float6 } // 保存文件 - slog.Info("save bitmap file", "filename", bmpFile) - err = bloombmp.SaveToFile(bmpFile) + slog.Info("save bitmap file", "filename", outFile) + err = bloombmp.SaveToFile(outFile) if err != nil { slog.Error("save bitmap file error", "err", err) diff --git a/or.go b/or.go new file mode 100644 index 0000000..636dc7d --- /dev/null +++ b/or.go @@ -0,0 +1,81 @@ +package main + +import ( +"flag" +"log/slog" + +"git.algo.com.cn/public/bloomtool/internal/bloom" +) + +func RunOr(args ...string) error { + fs := flag.NewFlagSet("or", 
flag.ExitOnError) + + bmpFile1 := fs.String("b1", "", "first bitmap filename") + bmpFile2 := fs.String("b2", "", "second bitmap filename") + outFile := fs.String("o", "", "output bitmap filename") + + if err := fs.Parse(args); err != nil { + return err + } else if fs.NArg() > 0 || *bmpFile1 == "" || *bmpFile2 == "" || *outFile == "" { + fs.Usage() + return nil + } + + return orOperation(*bmpFile1, *bmpFile2, *outFile) +} + +func orOperation(bmpFile1, bmpFile2, outFile string) error { + // 加载第一个bitmap文件 + slog.Info("load first bitmap file", "filename", bmpFile1) + bf1, err := bloom.LoadFromFile(bmpFile1, false) + if err != nil { + slog.Error("load first bitmap file error", "err", err) + return err + } + + // 加载第二个bitmap文件 + slog.Info("load second bitmap file", "filename", bmpFile2) + bf2, err := bloom.LoadFromFile(bmpFile2, false) + if err != nil { + slog.Error("load second bitmap file error", "err", err) + return err + } + + // 检查参数一致性 + stat1 := bf1.GetStat() + stat2 := bf2.GetStat() + + slog.Info("first bitmap info", "elements", stat1.ElementsMax, "falseRate", stat1.FalsePositiveRate) + slog.Info("second bitmap info", "elements", stat2.ElementsMax, "falseRate", stat2.FalsePositiveRate) + + if stat1.ElementsMax != stat2.ElementsMax { + slog.Error("elements count mismatch", "file1", stat1.ElementsMax, "file2", stat2.ElementsMax) + return flag.ErrHelp + } + + if stat1.FalsePositiveRate != stat2.FalsePositiveRate { + slog.Error("false positive rate mismatch", "file1", stat1.FalsePositiveRate, "file2", stat2.FalsePositiveRate) + return flag.ErrHelp + } + + // 执行OR运算 + slog.Info("perform OR operation") + result, err := bf1.Or(bf2) + if err != nil { + slog.Error("OR operation error", "err", err) + return err + } + + // 保存结果 + slog.Info("save result bitmap file", "filename", outFile) + err = result.SaveToFile(outFile) + if err != nil { + slog.Error("save result bitmap file error", "err", err) + return err + } + + resultStat := result.GetStat() + slog.Info("OR operation 
completed", "resultElements", resultStat.ElementsAdded) + + return nil +}