增加交并计算功能

This commit is contained in:
algotao
2025-11-05 16:41:06 +08:00
parent 869bae0a9e
commit 44d9206b9f
10 changed files with 950 additions and 12 deletions

3
.gitignore vendored
View File

@@ -27,4 +27,5 @@ go.work.sum
# default build target
bloomtool
test/
test/
*.bmp

View File

@@ -8,6 +8,7 @@ BloomTool 是一个基于 Go 语言开发的布隆过滤器命令行工具,用
- **低误判率**: 支持自定义误判率,默认千万分之一
- **大容量**: 支持最多 100 亿个元素
- **简单易用**: 命令行界面,操作简单
- **位图运算**: 支持AND/OR位图运算实现集合操作
## 安装
@@ -28,7 +29,7 @@ bloomtool makebloom -d <设备ID文件> -b <位图输出文件> [-e <最大元
**参数:**
- `-d`: 设备ID文本文件路径每行一个ID
- `-b`: 布隆过滤器位图输出文件路径
- `-o`: 布隆过滤器位图输出文件路径
- `-e`: 最大元素数量0表示自动计算最大100亿
- `-r`: 误判率0.01-0.000000001默认0.00000001
@@ -80,7 +81,59 @@ bloomtool info -b <位图文件>
bloomtool info -b bloom.blm
```
### 4. help - 帮助信息
### 4. and - 位图与运算(求交)
对两个布隆过滤器位图文件执行AND运算生成新的位图文件。
**注意对两个文件进行与运算并不等同于元素求交。请查阅bloomfilter的作用原理。**
**语法:**
```bash
bloomtool and -b1 <位图文件1> -b2 <位图文件2> -o <输出文件>
```
**参数:**
- `-b1`: 第一个布隆过滤器位图文件路径
- `-b2`: 第二个布隆过滤器位图文件路径
- `-o`: AND运算结果输出文件路径
**说明:**
- AND运算结果近似表示同时存在于两个布隆过滤器中的元素(并非精确的元素交集,误判率可能高于单独构建的过滤器)
- 要求两个位图文件的参数(元素数量上限和误判率)必须完全一致
- 运算结果仍然是一个布隆过滤器,可以继续用于其他操作
**示例:**
```bash
# 对两个布隆过滤器执行AND运算
bloomtool and -b1 bloom1.blm -b2 bloom2.blm -o result.blm
```
### 5. or - 位图或运算(求并)
对两个布隆过滤器位图文件执行OR运算生成新的位图文件。
**语法:**
```bash
bloomtool or -b1 <位图文件1> -b2 <位图文件2> -o <输出文件>
```
**参数:**
- `-b1`: 第一个布隆过滤器位图文件路径
- `-b2`: 第二个布隆过滤器位图文件路径
- `-o`: OR运算结果输出文件路径
**说明:**
- OR运算结果包含存在于任一布隆过滤器中的元素
- 要求两个位图文件的参数(元素数量上限和误判率)必须完全一致
- 运算结果仍然是一个布隆过滤器,可以继续用于其他操作
**示例:**
```bash
# 对两个布隆过滤器执行OR运算
bloomtool or -b1 bloom1.blm -b2 bloom2.blm -o result.blm
```
### 6. help - 帮助信息
显示命令使用帮助。
@@ -112,6 +165,23 @@ bloomtool hittest -d test_devices.txt -b device_bloom.blm -s test_results.txt
bloomtool info -b device_bloom.blm
```
4. **位图运算示例**
```bash
# 创建两个不同的布隆过滤器
bloomtool makebloom -d devices1.txt -o bloom1.blm
bloomtool makebloom -d devices2.txt -o bloom2.blm
# AND运算获取同时存在于两个集合中的元素
bloomtool and -b1 bloom1.blm -b2 bloom2.blm -o intersection.blm
# OR运算获取存在于任一集合中的元素
bloomtool or -b1 bloom1.blm -b2 bloom2.blm -o union.blm
# 测试运算结果
bloomtool hittest -d test_ids.txt -b intersection.blm -o intersection_results.txt
bloomtool hittest -d test_ids.txt -b union.blm -o union_results.txt
```
## 文件格式说明
### 输入文件格式
@@ -128,6 +198,7 @@ bloomtool info -b device_bloom.blm
- **查询速度**: 常数时间复杂度的查询操作
- **误判率可控**: 可根据需求调整误判率
- **大容量支持**: 支持海量数据处理
- **集合运算**: 支持高效的位图AND/OR运算
## 注意事项
@@ -135,6 +206,7 @@ bloomtool info -b device_bloom.blm
2. 不支持从布隆过滤器中删除元素
3. 位图文件大小与元素数量和误判率相关
4. 建议对输入数据进行哈希处理以提高性能
5. AND/OR运算要求两个位图文件的参数必须完全一致
## 依赖项

81
and.go Normal file
View File

@@ -0,0 +1,81 @@
package main
import (
"flag"
"log/slog"
"git.algo.com.cn/public/bloomtool/internal/bloom"
)
// RunAnd implements the "and" subcommand. It parses the -b1, -b2 and -o flags
// and, when all three are set and no positional arguments remain, hands the
// file names to andOperation. Otherwise it prints the flag usage and returns nil.
func RunAnd(args ...string) error {
	flags := flag.NewFlagSet("and", flag.ExitOnError)
	first := flags.String("b1", "", "first bitmap filename")
	second := flags.String("b2", "", "second bitmap filename")
	output := flags.String("o", "", "output bitmap filename")

	if err := flags.Parse(args); err != nil {
		return err
	}

	// All three file names are mandatory and stray positional arguments are rejected.
	complete := flags.NArg() == 0 && *first != "" && *second != "" && *output != ""
	if !complete {
		flags.Usage()
		return nil
	}

	return andOperation(*first, *second, *output)
}
// andOperation loads the two bitmap files, checks that both report the same
// maximum element count and false positive rate, ANDs them and saves the
// result to outFile.
//
// Every failure is logged before it is returned. A parameter mismatch is
// reported to the caller as flag.ErrHelp.
func andOperation(bmpFile1, bmpFile2, outFile string) error {
	// Load the first bitmap file.
	slog.Info("load first bitmap file", "filename", bmpFile1)
	left, err := bloom.LoadFromFile(bmpFile1, false)
	if err != nil {
		slog.Error("load first bitmap file error", "err", err)
		return err
	}

	// Load the second bitmap file.
	slog.Info("load second bitmap file", "filename", bmpFile2)
	right, err := bloom.LoadFromFile(bmpFile2, false)
	if err != nil {
		slog.Error("load second bitmap file error", "err", err)
		return err
	}

	// Both inputs must have been created with identical parameters.
	leftStat, rightStat := left.GetStat(), right.GetStat()
	slog.Info("first bitmap info", "elements", leftStat.ElementsMax, "falseRate", leftStat.FalsePositiveRate)
	slog.Info("second bitmap info", "elements", rightStat.ElementsMax, "falseRate", rightStat.FalsePositiveRate)

	switch {
	case leftStat.ElementsMax != rightStat.ElementsMax:
		slog.Error("elements count mismatch", "file1", leftStat.ElementsMax, "file2", rightStat.ElementsMax)
		return flag.ErrHelp
	case leftStat.FalsePositiveRate != rightStat.FalsePositiveRate:
		slog.Error("false positive rate mismatch", "file1", leftStat.FalsePositiveRate, "file2", rightStat.FalsePositiveRate)
		return flag.ErrHelp
	}

	// Combine the two filters.
	slog.Info("perform AND operation")
	combined, err := left.And(right)
	if err != nil {
		slog.Error("AND operation error", "err", err)
		return err
	}

	// Persist the combined filter.
	slog.Info("save result bitmap file", "filename", outFile)
	if err = combined.SaveToFile(outFile); err != nil {
		slog.Error("save result bitmap file error", "err", err)
		return err
	}

	combinedStat := combined.GetStat()
	slog.Info("AND operation completed", "resultElements", combinedStat.ElementsAdded)
	return nil
}

645
bitmap_test.go Normal file
View File

@@ -0,0 +1,645 @@
package main
import (
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"git.algo.com.cn/public/bloomtool/internal/bloom"
)
// TestBloomFilterBasic exercises the basic add/test round trip of a bloom filter.
func TestBloomFilterBasic(t *testing.T) {
	filter := bloom.NewWithEstimates(1000, 0.01)

	// Insert the known elements, then flush so that all of them are processed.
	present := []string{"apple", "banana", "cherry", "date"}
	for i := range present {
		filter.AddString(present[i])
	}
	filter.Flush()

	// Every inserted element must be reported as present.
	for _, item := range present {
		if filter.TestString(item) {
			continue
		}
		t.Errorf("Expected element %s to be present", item)
	}

	// Elements that were never added may produce false positives, but not always.
	absent := []string{"elderberry", "fig", "grape"}
	var hits int
	for _, item := range absent {
		if filter.TestString(item) {
			hits++
		}
	}

	// Tolerate only a limited share of false positives.
	if rate := float64(hits) / float64(len(absent)); rate > 0.1 {
		t.Errorf("False positive rate too high: %f", rate)
	}
}
// TestBloomFilterStatistics checks the counters reported by GetStat before and
// after elements are added.
func TestBloomFilterStatistics(t *testing.T) {
	filter := bloom.NewWithEstimates(1000, 0.01)

	// A fresh filter reports nothing added and the requested capacity.
	initial := filter.GetStat()
	if got := initial.ElementsAdded; got != 0 {
		t.Errorf("Expected 0 elements added, got %d", got)
	}
	if got := initial.ElementsMax; got != 1000 {
		t.Errorf("Expected 1000 max elements, got %d", got)
	}

	// Add a few elements and flush the buffer.
	items := []string{"apple", "banana", "cherry"}
	for _, item := range items {
		filter.AddString(item)
	}
	filter.Flush()

	// The added counter must now match the number of inserted elements.
	if after := filter.GetStat(); after.ElementsAdded != uint64(len(items)) {
		t.Errorf("Expected %d elements added, got %d", len(items), after.ElementsAdded)
	}
}
// TestBloomFilterSaveAndLoad verifies that a filter written with SaveToFile and
// read back with LoadFromFile keeps its counters and still matches its elements.
func TestBloomFilterSaveAndLoad(t *testing.T) {
	// The bitmap lives in a per-test temporary directory.
	path := filepath.Join(t.TempDir(), "test.bmp")

	// Build and populate the source filter; flush so every element is processed.
	source := bloom.NewWithEstimates(1000, 0.01)
	items := []string{"apple", "banana", "cherry", "date", "elderberry"}
	for _, item := range items {
		source.AddString(item)
	}
	source.Flush()

	if err := source.SaveToFile(path); err != nil {
		t.Fatalf("Failed to save bloom filter: %v", err)
	}

	restored, err := bloom.LoadFromFile(path, false)
	if err != nil {
		t.Fatalf("Failed to load bloom filter: %v", err)
	}

	// The statistics must survive the round trip.
	before, after := source.GetStat(), restored.GetStat()
	if before.ElementsAdded != after.ElementsAdded {
		t.Errorf("Elements count mismatch: original=%d, loaded=%d",
			before.ElementsAdded, after.ElementsAdded)
	}
	if before.ElementsMax != after.ElementsMax {
		t.Errorf("Max elements mismatch: original=%d, loaded=%d",
			before.ElementsMax, after.ElementsMax)
	}

	// So must the membership answers.
	for _, item := range items {
		if restored.TestString(item) {
			continue
		}
		t.Errorf("Loaded bloom filter missing element: %s", item)
	}
}
// TestBloomFilterAndOperation checks that And keeps the elements that were
// added to both input filters.
func TestBloomFilterAndOperation(t *testing.T) {
	left := bloom.NewWithEstimates(1000, 0.01)
	right := bloom.NewWithEstimates(1000, 0.01)

	shared := []string{"apple", "banana"}      // added to both filters
	leftOnly := []string{"cherry", "date"}     // added to left only
	rightOnly := []string{"elderberry", "fig"} // added to right only

	for _, item := range shared {
		left.AddString(item)
		right.AddString(item)
	}
	for _, item := range leftOnly {
		left.AddString(item)
	}
	for _, item := range rightOnly {
		right.AddString(item)
	}

	// Flush both buffers before combining.
	left.Flush()
	right.Flush()

	combined, err := left.And(right)
	if err != nil {
		t.Fatalf("AND operation failed: %v", err)
	}

	// Every shared element must still be reported by the combined filter.
	missing := 0
	for _, item := range shared {
		if combined.TestString(item) {
			continue
		}
		missing++
		t.Logf("Common element %s missing from AND result", item)
	}
	if missing > 0 {
		t.Error("Not all common elements found in AND result")
	}

	// Informational only: note when an element unique to one filter was dropped.
	inLeft := left.TestString("cherry")
	inCombined := combined.TestString("cherry")
	if inLeft && !inCombined {
		t.Log("AND operation correctly removed element present in only one filter")
	}
}
// TestBloomFilterOrOperation checks that Or produces a filter that reports
// every element added to either input filter.
func TestBloomFilterOrOperation(t *testing.T) {
	bf1 := bloom.NewWithEstimates(1000, 0.01)
	bf2 := bloom.NewWithEstimates(1000, 0.01)

	commonData := []string{"apple", "banana"}  // added to both filters
	onlyInBf1 := []string{"cherry", "date"}    // added to bf1 only
	onlyInBf2 := []string{"elderberry", "fig"} // added to bf2 only

	// Collect everything into a fresh slice so the inputs are never aliased.
	allData := make([]string, 0, len(commonData)+len(onlyInBf1)+len(onlyInBf2))
	allData = append(allData, commonData...)
	allData = append(allData, onlyInBf1...)
	allData = append(allData, onlyInBf2...)

	for _, data := range commonData {
		bf1.AddString(data)
		bf2.AddString(data)
	}
	for _, data := range onlyInBf1 {
		bf1.AddString(data)
	}
	for _, data := range onlyInBf2 {
		bf2.AddString(data)
	}

	// Flush both buffers before combining.
	bf1.Flush()
	bf2.Flush()

	result, err := bf1.Or(bf2)
	if err != nil {
		t.Fatalf("OR operation failed: %v", err)
	}

	// A union keeps every bit set in either input, and bloom filters have no
	// false negatives, so not a single inserted element may be missing. This
	// matches the strict check in TestRunOr.
	for _, data := range allData {
		if !result.TestString(data) {
			t.Errorf("Element %s missing from OR result", data)
		}
	}
}
// TestBloomFilterOperationErrors checks how And and Or react to filters that
// were created with different parameters.
func TestBloomFilterOperationErrors(t *testing.T) {
	// Same capacity but a different false positive rate: both operations must fail.
	base := bloom.NewWithEstimates(1000, 0.01)
	otherRate := bloom.NewWithEstimates(1000, 0.02)

	if _, err := base.And(otherRate); err == nil {
		t.Error("Expected AND operation to fail with different false positive rates")
	}
	if _, err := base.Or(otherRate); err == nil {
		t.Error("Expected OR operation to fail with different false positive rates")
	}

	// Same false positive rate but a different capacity. The outcome is only
	// logged, because the k value may turn out to be the same for both filters.
	otherSize := bloom.NewWithEstimates(2000, 0.01)

	if _, err := base.And(otherSize); err != nil {
		t.Log("AND operation correctly failed with different parameters")
	} else {
		t.Log("Note: Different element counts resulted in same k value, AND operation succeeded")
	}

	if _, err := base.Or(otherSize); err != nil {
		t.Log("OR operation correctly failed with different parameters")
	} else {
		t.Log("Note: Different element counts resulted in same k value, OR operation succeeded")
	}
}
// TestRunMakeBloom runs the makebloom command on a small input file and checks
// that the bitmap it writes contains every input line.
func TestRunMakeBloom(t *testing.T) {
	dir := t.TempDir()
	inputPath := filepath.Join(dir, "test_data.txt")
	bitmapPath := filepath.Join(dir, "test_output.bmp")

	// Write the input file, one element per line.
	content := "apple\nbanana\ncherry\ndate\nelderberry\n"
	if err := os.WriteFile(inputPath, []byte(content), 0644); err != nil {
		t.Fatalf("Failed to create test data file: %v", err)
	}

	// Run the command under test.
	if err := RunMakeBloom("-d", inputPath, "-o", bitmapPath, "-e", "1000", "-r", "0.01"); err != nil {
		t.Fatalf("RunMakeBloom failed: %v", err)
	}

	// The output file must exist.
	if _, err := os.Stat(bitmapPath); os.IsNotExist(err) {
		t.Error("Output file was not created")
	}

	// Load the bitmap and look up every input line.
	filter, err := bloom.LoadFromFile(bitmapPath, false)
	if err != nil {
		t.Fatalf("Failed to load created bitmap: %v", err)
	}

	trimmed := strings.TrimSpace(content)
	for _, entry := range strings.Split(trimmed, "\n") {
		if filter.TestString(entry) {
			continue
		}
		t.Errorf("Element %s not found in created bitmap", entry)
	}
}
// TestRunHitTest runs the hittest command against a prepared bitmap and counts
// the result lines that mention the elements known to be in the bitmap.
func TestRunHitTest(t *testing.T) {
	dir := t.TempDir()
	bitmapPath := filepath.Join(dir, "test.bmp")
	queryPath := filepath.Join(dir, "test_data.txt")
	reportPath := filepath.Join(dir, "test_result.txt")

	// Build the bitmap that the command will query.
	filter := bloom.NewWithEstimates(1000, 0.01)
	for _, item := range []string{"apple", "banana", "cherry", "date"} {
		filter.AddString(item)
	}
	filter.Flush()
	if err := filter.SaveToFile(bitmapPath); err != nil {
		t.Fatalf("Failed to save bitmap: %v", err)
	}

	// Write the lines to look up.
	queries := "apple\nbanana\nelderberry\nfig\n"
	if err := os.WriteFile(queryPath, []byte(queries), 0644); err != nil {
		t.Fatalf("Failed to create test data file: %v", err)
	}

	// Run the command under test.
	if err := RunHitTest("-d", queryPath, "-b", bitmapPath, "-o", reportPath); err != nil {
		t.Fatalf("RunHitTest failed: %v", err)
	}

	report, err := os.ReadFile(reportPath)
	if err != nil {
		t.Fatalf("Failed to read result file: %v", err)
	}

	// apple and banana should be found; elderberry and fig should not.
	const wantHits = 2
	gotHits := 0
	for _, line := range strings.Split(strings.TrimSpace(string(report)), "\n") {
		switch {
		case strings.Contains(line, "apple"), strings.Contains(line, "banana"):
			gotHits++
		}
	}
	if gotHits != wantHits {
		t.Errorf("Expected %d hits, got %d", wantHits, gotHits)
	}
}
// TestRunInfo checks that the info command runs without error on a saved
// bitmap. The command's printed output is not inspected here.
func TestRunInfo(t *testing.T) {
	tempDir := t.TempDir()
	bitmapFile := filepath.Join(tempDir, "test.bmp")

	// Build the bitmap for the command to describe.
	bf := bloom.NewWithEstimates(1000, 0.01)
	bf.AddString("apple")
	bf.AddString("banana")
	// Flush before saving, as every other test in this file does, so that all
	// added elements are processed before the bitmap is written.
	bf.Flush()

	if err := bf.SaveToFile(bitmapFile); err != nil {
		t.Fatalf("Failed to save bitmap: %v", err)
	}

	// Only success or failure of the command is asserted.
	if err := RunInfo("-b", bitmapFile); err != nil {
		t.Errorf("RunInfo failed: %v", err)
	}
}
// TestRunAnd runs the and command on two saved bitmaps and checks that the
// elements they have in common are present in the output bitmap.
func TestRunAnd(t *testing.T) {
	dir := t.TempDir()
	firstPath := filepath.Join(dir, "test1.bmp")
	secondPath := filepath.Join(dir, "test2.bmp")
	resultPath := filepath.Join(dir, "and_result.bmp")

	// First input: apple, banana, cherry.
	first := bloom.NewWithEstimates(1000, 0.01)
	for _, item := range []string{"apple", "banana", "cherry"} {
		first.AddString(item)
	}
	first.Flush()
	if err := first.SaveToFile(firstPath); err != nil {
		t.Fatalf("Failed to save bitmap1: %v", err)
	}

	// Second input: banana, cherry, date.
	second := bloom.NewWithEstimates(1000, 0.01)
	for _, item := range []string{"banana", "cherry", "date"} {
		second.AddString(item)
	}
	second.Flush()
	if err := second.SaveToFile(secondPath); err != nil {
		t.Fatalf("Failed to save bitmap2: %v", err)
	}

	// Run the command under test.
	if err := RunAnd("-b1", firstPath, "-b2", secondPath, "-o", resultPath); err != nil {
		t.Fatalf("RunAnd failed: %v", err)
	}

	combined, err := bloom.LoadFromFile(resultPath, false)
	if err != nil {
		t.Fatalf("Failed to load result bitmap: %v", err)
	}

	// banana and cherry were in both inputs and must be present.
	if !(combined.TestString("banana") && combined.TestString("cherry")) {
		t.Error("Expected common elements missing from AND result")
	}
}
// TestRunOr runs the or command on two saved bitmaps and checks that every
// element of either input is present in the output bitmap.
func TestRunOr(t *testing.T) {
	dir := t.TempDir()
	firstPath := filepath.Join(dir, "test1.bmp")
	secondPath := filepath.Join(dir, "test2.bmp")
	resultPath := filepath.Join(dir, "or_result.bmp")

	// First input: apple, banana.
	first := bloom.NewWithEstimates(1000, 0.01)
	for _, item := range []string{"apple", "banana"} {
		first.AddString(item)
	}
	first.Flush()
	if err := first.SaveToFile(firstPath); err != nil {
		t.Fatalf("Failed to save bitmap1: %v", err)
	}

	// Second input: cherry, date.
	second := bloom.NewWithEstimates(1000, 0.01)
	for _, item := range []string{"cherry", "date"} {
		second.AddString(item)
	}
	second.Flush()
	if err := second.SaveToFile(secondPath); err != nil {
		t.Fatalf("Failed to save bitmap2: %v", err)
	}

	// Run the command under test.
	if err := RunOr("-b1", firstPath, "-b2", secondPath, "-o", resultPath); err != nil {
		t.Fatalf("RunOr failed: %v", err)
	}

	merged, err := bloom.LoadFromFile(resultPath, false)
	if err != nil {
		t.Fatalf("Failed to load result bitmap: %v", err)
	}

	// Every element of either input must be present.
	for _, item := range []string{"apple", "banana", "cherry", "date"} {
		if merged.TestString(item) {
			continue
		}
		t.Errorf("Element %s missing from OR result", item)
	}
}
// TestBloomFilterEdgeCases adds unusual strings (empty, very long, non-ASCII
// with punctuation) to one filter and checks that each is found afterwards.
func TestBloomFilterEdgeCases(t *testing.T) {
	filter := bloom.NewWithEstimates(100, 0.01)

	cases := []struct {
		value   string // element to add and look up
		failure string // message reported when the lookup misses
	}{
		{"", "Empty string should be testable"},
		{strings.Repeat("a", 10000), "Long string should be testable"},
		{"测试中文字符符!@#$%^&*()", "Special characters should be testable"},
	}

	// Each case is added, flushed and checked before the next one is added.
	for _, c := range cases {
		filter.AddString(c.value)
		filter.Flush()
		if !filter.TestString(c.value) {
			t.Error(c.failure)
		}
	}
}
// TestBloomFilterConcurrency adds elements from several goroutines at once and
// then checks that every one of them is found in the filter.
func TestBloomFilterConcurrency(t *testing.T) {
	const (
		workers   = 10
		perWorker = 100
	)
	filter := bloom.NewWithEstimates(10000, 0.01)

	// Each worker adds its own range of keys and then signals completion.
	finished := make(chan bool, workers)
	produce := func(worker int) {
		for n := 0; n < perWorker; n++ {
			key := fmt.Sprintf("item_%d_%d", worker, n)
			filter.AddString(key)
		}
		finished <- true
	}
	for w := 0; w < workers; w++ {
		go produce(w)
	}

	// Wait for every worker to finish.
	for w := 0; w < workers; w++ {
		<-finished
	}

	// Flush all buffers before looking anything up.
	filter.Flush()

	// Every key added by any worker must be present.
	for w := 0; w < workers; w++ {
		for n := 0; n < perWorker; n++ {
			key := fmt.Sprintf("item_%d_%d", w, n)
			if filter.TestString(key) {
				continue
			}
			t.Errorf("Concurrent element %s not found", key)
		}
	}
}
// BenchmarkBloomFilterAdd measures adding b.N distinct keys to one filter.
// The key formatting and the final Flush are part of the timed region.
func BenchmarkBloomFilterAdd(b *testing.B) {
	filter := bloom.NewWithEstimates(1000000, 0.01)

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		key := fmt.Sprintf("item_%d", n)
		filter.AddString(key)
	}
	filter.Flush()
}
// BenchmarkBloomFilterTest measures lookups of keys that were added to the
// filter beforehand.
func BenchmarkBloomFilterTest(b *testing.B) {
	const preloaded = 1000
	filter := bloom.NewWithEstimates(100000, 0.01)

	// Populate the filter outside the timed region.
	for n := 0; n < preloaded; n++ {
		filter.AddString(fmt.Sprintf("item_%d", n))
	}
	filter.Flush()

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		key := fmt.Sprintf("item_%d", n%preloaded)
		filter.TestString(key)
	}
}
// BenchmarkBloomFilterAndOperation measures And on two filters that hold the
// same 1000 keys.
func BenchmarkBloomFilterAndOperation(b *testing.B) {
	left := bloom.NewWithEstimates(10000, 0.01)
	right := bloom.NewWithEstimates(10000, 0.01)

	// Populate both filters outside the timed region.
	for n := 0; n < 1000; n++ {
		left.AddString(fmt.Sprintf("item_%d", n))
		right.AddString(fmt.Sprintf("item_%d", n))
	}
	left.Flush()
	right.Flush()

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		if _, err := left.And(right); err != nil {
			b.Fatal(err)
		}
	}
}
// BenchmarkBloomFilterOrOperation measures Or on two filters that hold
// disjoint sets of 1000 keys each.
func BenchmarkBloomFilterOrOperation(b *testing.B) {
	left := bloom.NewWithEstimates(10000, 0.01)
	right := bloom.NewWithEstimates(10000, 0.01)

	// Populate both filters outside the timed region; right is offset by 1000.
	for n := 0; n < 1000; n++ {
		left.AddString(fmt.Sprintf("item_%d", n))
		right.AddString(fmt.Sprintf("item_%d", n+1000))
	}
	left.Flush()
	right.Flush()

	b.ResetTimer()
	for n := 0; n < b.N; n++ {
		if _, err := left.Or(right); err != nil {
			b.Fatal(err)
		}
	}
}

View File

@@ -19,6 +19,8 @@ The commands are:
makebloom Make bloom filter bitmap file
hittest Hittest text lines in bitmap
info Show bitmap file info
and AND operation between two bitmap files
or OR operation between two bitmap files
"help" is the default command.

View File

@@ -15,18 +15,18 @@ func RunHitTest(args ...string) error {
txtFile := fs.String("d", "", "device id filename")
bmpFile := fs.String("b", "", "bitmap filename")
stateFile := fs.String("s", "", "state filename for output")
outStateFile := fs.String("o", "", "state filename for output")
filter := fs.Bool("f", false, "filter for hit only")
if err := fs.Parse(args); err != nil {
return err
} else if fs.NArg() > 0 || *txtFile == "" || *bmpFile == "" || *stateFile == "" {
} else if fs.NArg() > 0 || *txtFile == "" || *bmpFile == "" || *outStateFile == "" {
fmt.Println(fs.NArg())
fs.Usage()
return nil
}
return hitTest(*txtFile, *bmpFile, *stateFile, *filter)
return hitTest(*txtFile, *bmpFile, *outStateFile, *filter)
}
func hitTest(txtFile, bmpFile, stateFile string, filter bool) error {

View File

@@ -343,3 +343,55 @@ func LoadFromFile(filename string, headerOnly bool) (bft *BloomFilter, err error
return bft, nil
}
// And returns a new bloom filter whose bitmap is the intersection
// (roaring64.And) of the bitmaps of b and other.
//
// Both filters must have the same bitmap size m, the same k and the same false
// positive rate; otherwise an error is returned. The size check matters
// because two filters can share k and the false positive rate while having
// different capacities, and combining bitmaps of different sizes is
// presumably meaningless (NOTE(review): assumes bit positions depend on m).
// A nil other is rejected with an error instead of panicking.
//
// The result's elementsAdded is only an estimate, since the true count cannot
// be derived from the bitmaps: it is the smaller of the two inputs' counts.
func (b *BloomFilter) And(other *BloomFilter) (*BloomFilter, error) {
	if other == nil {
		return nil, errors.New("other bloom filter must not be nil for AND operation")
	}

	// Parameter consistency checks.
	if b.m != other.m {
		return nil, errors.New("bloom filters must have same bitmap size for AND operation")
	}
	if b.k != other.k || b.falsePositiveRate != other.falsePositiveRate {
		return nil, errors.New("bloom filters must have same k and falsePositiveRate for AND operation")
	}

	// Build the result with the shared parameters and the intersected bitmap.
	result := newBloomFilter(b.m, b.k, b.elementsMax, b.falsePositiveRate)
	result.rb = roaring64.And(b.rb, other.rb)

	// Estimate: the smaller of the two element counts.
	result.elementsAdded = b.elementsAdded
	if other.elementsAdded < result.elementsAdded {
		result.elementsAdded = other.elementsAdded
	}

	return result, nil
}
// Or returns a new bloom filter whose bitmap is the union (roaring64.Or) of
// the bitmaps of b and other.
//
// Both filters must have the same bitmap size m, the same k and the same false
// positive rate; otherwise an error is returned. The size check matters
// because two filters can share k and the false positive rate while having
// different capacities, and combining bitmaps of different sizes is
// presumably meaningless (NOTE(review): assumes bit positions depend on m).
// A nil other is rejected with an error instead of panicking.
//
// The result's elementsAdded is only an estimate: it is the larger of the two
// inputs' counts, capped at the result's elementsMax.
func (b *BloomFilter) Or(other *BloomFilter) (*BloomFilter, error) {
	if other == nil {
		return nil, errors.New("other bloom filter must not be nil for OR operation")
	}

	// Parameter consistency checks.
	if b.m != other.m {
		return nil, errors.New("bloom filters must have same bitmap size for OR operation")
	}
	if b.k != other.k || b.falsePositiveRate != other.falsePositiveRate {
		return nil, errors.New("bloom filters must have same k and falsePositiveRate for OR operation")
	}

	// Build the result with the shared parameters and the merged bitmap.
	result := newBloomFilter(b.m, b.k, b.elementsMax, b.falsePositiveRate)
	result.rb = roaring64.Or(b.rb, other.rb)

	// Estimate: the larger of the two element counts, never above capacity.
	result.elementsAdded = b.elementsAdded
	if other.elementsAdded > result.elementsAdded {
		result.elementsAdded = other.elementsAdded
	}
	if result.elementsAdded > result.elementsMax {
		result.elementsAdded = result.elementsMax
	}

	return result, nil
}

View File

@@ -25,6 +25,10 @@ func Run(args ...string) error {
return RunHitTest(args...)
case "info":
return RunInfo(args...)
case "and":
return RunAnd(args...)
case "or":
return RunOr(args...)
default:
err := fmt.Errorf(`unknown command "%s"`+"\n"+`Run 'bloomtool help' for usage`, name)
slog.Warn(err.Error())

View File

@@ -17,23 +17,23 @@ func RunMakeBloom(args ...string) error {
fs := flag.NewFlagSet("makebloom", flag.ExitOnError)
txtFile := fs.String("d", "", "device id filename")
bmpFile := fs.String("b", "", "bitmap filename for output")
outFile := fs.String("o", "", "output bitmap filename")
elements := fs.Uint64("e", 0, "max elements. (max 100 0000 0000). if 0 then auto")
falseRate := fs.Float64("r", FalseRate, "false rate (0.01--0.0000 0000 1)")
if err := fs.Parse(args); err != nil {
return err
} else if fs.NArg() > 0 || *txtFile == "" || *bmpFile == "" ||
} else if fs.NArg() > 0 || *txtFile == "" || *outFile == "" ||
*elements > 10000000000 ||
*falseRate > 0.01 || *falseRate < 0.000000001 {
fs.Usage()
return nil
}
return makeBloom(*txtFile, *bmpFile, *elements, *falseRate)
return makeBloom(*txtFile, *outFile, *elements, *falseRate)
}
func makeBloom(txtFile string, bmpFile string, elements uint64, falseRate float64) error {
func makeBloom(txtFile string, outFile string, elements uint64, falseRate float64) error {
// 打开设备号文件
slog.Info("open source file", "filename", txtFile)
tfile, err := os.Open(txtFile)
@@ -76,8 +76,8 @@ func makeBloom(txtFile string, bmpFile string, elements uint64, falseRate float6
}
// 保存文件
slog.Info("save bitmap file", "filename", bmpFile)
err = bloombmp.SaveToFile(bmpFile)
slog.Info("save bitmap file", "filename", outFile)
err = bloombmp.SaveToFile(outFile)
if err != nil {
slog.Error("save bitmap file error", "err", err)

81
or.go Normal file
View File

@@ -0,0 +1,81 @@
package main
import (
"flag"
"log/slog"
"git.algo.com.cn/public/bloomtool/internal/bloom"
)
// RunOr implements the "or" subcommand. It parses the -b1, -b2 and -o flags
// and, when all three are set and no positional arguments remain, hands the
// file names to orOperation. Otherwise it prints the flag usage and returns nil.
func RunOr(args ...string) error {
	flags := flag.NewFlagSet("or", flag.ExitOnError)
	first := flags.String("b1", "", "first bitmap filename")
	second := flags.String("b2", "", "second bitmap filename")
	output := flags.String("o", "", "output bitmap filename")

	if err := flags.Parse(args); err != nil {
		return err
	}

	// All three file names are mandatory and stray positional arguments are rejected.
	complete := flags.NArg() == 0 && *first != "" && *second != "" && *output != ""
	if !complete {
		flags.Usage()
		return nil
	}

	return orOperation(*first, *second, *output)
}
// orOperation loads the two bitmap files, checks that both report the same
// maximum element count and false positive rate, ORs them and saves the result
// to outFile.
//
// Every failure is logged before it is returned. A parameter mismatch is
// reported to the caller as flag.ErrHelp.
func orOperation(bmpFile1, bmpFile2, outFile string) error {
	// Load the first bitmap file.
	slog.Info("load first bitmap file", "filename", bmpFile1)
	left, err := bloom.LoadFromFile(bmpFile1, false)
	if err != nil {
		slog.Error("load first bitmap file error", "err", err)
		return err
	}

	// Load the second bitmap file.
	slog.Info("load second bitmap file", "filename", bmpFile2)
	right, err := bloom.LoadFromFile(bmpFile2, false)
	if err != nil {
		slog.Error("load second bitmap file error", "err", err)
		return err
	}

	// Both inputs must have been created with identical parameters.
	leftStat, rightStat := left.GetStat(), right.GetStat()
	slog.Info("first bitmap info", "elements", leftStat.ElementsMax, "falseRate", leftStat.FalsePositiveRate)
	slog.Info("second bitmap info", "elements", rightStat.ElementsMax, "falseRate", rightStat.FalsePositiveRate)

	switch {
	case leftStat.ElementsMax != rightStat.ElementsMax:
		slog.Error("elements count mismatch", "file1", leftStat.ElementsMax, "file2", rightStat.ElementsMax)
		return flag.ErrHelp
	case leftStat.FalsePositiveRate != rightStat.FalsePositiveRate:
		slog.Error("false positive rate mismatch", "file1", leftStat.FalsePositiveRate, "file2", rightStat.FalsePositiveRate)
		return flag.ErrHelp
	}

	// Combine the two filters.
	slog.Info("perform OR operation")
	merged, err := left.Or(right)
	if err != nil {
		slog.Error("OR operation error", "err", err)
		return err
	}

	// Persist the merged filter.
	slog.Info("save result bitmap file", "filename", outFile)
	if err = merged.SaveToFile(outFile); err != nil {
		slog.Error("save result bitmap file error", "err", err)
		return err
	}

	mergedStat := merged.GetStat()
	slog.Info("OR operation completed", "resultElements", mergedStat.ElementsAdded)
	return nil
}