增加交并计算功能

2025-11-05 16:41:06 +08:00
parent 869bae0a9e
commit 44d9206b9f
10 changed files with 950 additions and 12 deletions
--- a/bitmap_test.go
+++ b/bitmap_test.go
@@ -0,0 +1,645 @@
+package main
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"git.algo.com.cn/public/bloomtool/internal/bloom"
+)
+
+// TestBloomFilterBasic 测试布隆过滤器基本功能
+func TestBloomFilterBasic(t *testing.T) {
+	// 创建布隆过滤器
+	bf := bloom.NewWithEstimates(1000, 0.01)
+
+	// 测试添加元素
+	testData := []string{"apple", "banana", "cherry", "date"}
+	for _, data := range testData {
+		bf.AddString(data)
+	}
+
+	// 刷新缓冲区确保所有元素都被处理
+	bf.Flush()
+
+	// 测试存在的元素
+	for _, data := range testData {
+		if !bf.TestString(data) {
+			t.Errorf("Expected element %s to be present", data)
+		}
+	}
+
+	// 测试不存在的元素（可能有假阳性，但不应该总是假阳性）
+	nonExistentData := []string{"elderberry", "fig", "grape"}
+	falsePositives := 0
+	for _, data := range nonExistentData {
+		if bf.TestString(data) {
+			falsePositives++
+		}
+	}
+
+	// 假阳性率应该合理
+	falsePositiveRate := float64(falsePositives) / float64(len(nonExistentData))
+	if falsePositiveRate > 0.1 { // 允许一定的假阳性率
+		t.Errorf("False positive rate too high: %f", falsePositiveRate)
+	}
+}
+
+// TestBloomFilterStatistics verifies the ElementsAdded and ElementsMax values
+// reported by GetStat before and after elements are added.
+func TestBloomFilterStatistics(t *testing.T) {
+	filter := bloom.NewWithEstimates(1000, 0.01)
+
+	// Fresh filter: nothing added yet, capacity equal to the requested estimate.
+	initial := filter.GetStat()
+	if got := initial.ElementsAdded; got != 0 {
+		t.Errorf("Expected 0 elements added, got %d", got)
+	}
+	if got := initial.ElementsMax; got != 1000 {
+		t.Errorf("Expected 1000 max elements, got %d", got)
+	}
+
+	// Add a few elements and flush the buffer.
+	items := []string{"apple", "banana", "cherry"}
+	for i := range items {
+		filter.AddString(items[i])
+	}
+	filter.Flush()
+
+	// The added counter must now equal the number of inserted strings.
+	want := uint64(len(items))
+	if after := filter.GetStat(); after.ElementsAdded != want {
+		t.Errorf("Expected %d elements added, got %d", len(items), after.ElementsAdded)
+	}
+}
+
+// TestBloomFilterSaveAndLoad writes a populated filter to a file, loads it
+// back, and checks that statistics and membership survive the round trip.
+func TestBloomFilterSaveAndLoad(t *testing.T) {
+	// The file lives in a per-test temporary directory.
+	path := filepath.Join(t.TempDir(), "test.bmp")
+
+	// Build the source filter.
+	source := bloom.NewWithEstimates(1000, 0.01)
+	items := []string{"apple", "banana", "cherry", "date", "elderberry"}
+	for i := range items {
+		source.AddString(items[i])
+	}
+
+	// Make sure all elements are processed before saving.
+	source.Flush()
+
+	if err := source.SaveToFile(path); err != nil {
+		t.Fatalf("Failed to save bloom filter: %v", err)
+	}
+
+	restored, err := bloom.LoadFromFile(path, false)
+	if err != nil {
+		t.Fatalf("Failed to load bloom filter: %v", err)
+	}
+
+	// Compare the statistics of both filters.
+	before, after := source.GetStat(), restored.GetStat()
+	if before.ElementsAdded != after.ElementsAdded {
+		t.Errorf("Elements count mismatch: original=%d, loaded=%d",
+			before.ElementsAdded, after.ElementsAdded)
+	}
+	if before.ElementsMax != after.ElementsMax {
+		t.Errorf("Max elements mismatch: original=%d, loaded=%d",
+			before.ElementsMax, after.ElementsMax)
+	}
+
+	// Every element stored before saving must be found after loading.
+	for _, item := range items {
+		if found := restored.TestString(item); !found {
+			t.Errorf("Loaded bloom filter missing element: %s", item)
+		}
+	}
+}
+
+// TestBloomFilterAndOperation intersects two filters that share some elements
+// and checks that the shared elements are still reported as present.
+func TestBloomFilterAndOperation(t *testing.T) {
+	left := bloom.NewWithEstimates(1000, 0.01)
+	right := bloom.NewWithEstimates(1000, 0.01)
+
+	shared := []string{"apple", "banana"}      // added to both filters
+	leftOnly := []string{"cherry", "date"}     // added to left only
+	rightOnly := []string{"elderberry", "fig"} // added to right only
+
+	for _, s := range shared {
+		left.AddString(s)
+		right.AddString(s)
+	}
+	for _, s := range leftOnly {
+		left.AddString(s)
+	}
+	for _, s := range rightOnly {
+		right.AddString(s)
+	}
+
+	// Flush both buffers before combining.
+	left.Flush()
+	right.Flush()
+
+	both, err := left.And(right)
+	if err != nil {
+		t.Fatalf("AND operation failed: %v", err)
+	}
+
+	// Shared elements must be present; log each one that is not.
+	missing := 0
+	for _, s := range shared {
+		if both.TestString(s) {
+			continue
+		}
+		missing++
+		t.Logf("Common element %s missing from AND result", s)
+	}
+	if missing > 0 {
+		t.Error("Not all common elements found in AND result")
+	}
+
+	// Informational only: an element added to just one input may still test
+	// positive in the result, so this is logged rather than asserted.
+	inLeft := left.TestString("cherry")
+	inBoth := both.TestString("cherry")
+	if inLeft && !inBoth {
+		t.Log("AND operation correctly removed element present in only one filter")
+	}
+}
+
+// TestBloomFilterOrOperation unions two filters and requires every element
+// that was added to either input to be reported as present in the result.
+func TestBloomFilterOrOperation(t *testing.T) {
+	bf1 := bloom.NewWithEstimates(1000, 0.01)
+	bf2 := bloom.NewWithEstimates(1000, 0.01)
+
+	commonData := []string{"apple", "banana"}  // added to both filters
+	onlyInBf1 := []string{"cherry", "date"}    // added to bf1 only
+	onlyInBf2 := []string{"elderberry", "fig"} // added to bf2 only
+
+	// Build the combined list in a freshly allocated slice so the appends can
+	// never write into the backing array of commonData.
+	allData := make([]string, 0, len(commonData)+len(onlyInBf1)+len(onlyInBf2))
+	allData = append(allData, commonData...)
+	allData = append(allData, onlyInBf1...)
+	allData = append(allData, onlyInBf2...)
+
+	for _, data := range commonData {
+		bf1.AddString(data)
+		bf2.AddString(data)
+	}
+	for _, data := range onlyInBf1 {
+		bf1.AddString(data)
+	}
+	for _, data := range onlyInBf2 {
+		bf2.AddString(data)
+	}
+
+	// Flush both buffers before combining.
+	bf1.Flush()
+	bf2.Flush()
+
+	result, err := bf1.Or(bf2)
+	if err != nil {
+		t.Fatalf("OR operation failed: %v", err)
+	}
+
+	// A missing element would be a false negative, so each one is a failure.
+	// This is the same strictness TestRunOr applies to the or command; the
+	// earlier version of this test tolerated up to half of the elements
+	// being absent, which could mask a broken Or implementation.
+	for _, data := range allData {
+		if !result.TestString(data) {
+			t.Errorf("Element %s missing from OR result", data)
+		}
+	}
+}
+
+// TestBloomFilterOperationErrors checks that And and Or reject a filter built
+// with a different false-positive rate, and logs what happens when only the
+// estimated element count differs.
+func TestBloomFilterOperationErrors(t *testing.T) {
+	base := bloom.NewWithEstimates(1000, 0.01)
+	otherRate := bloom.NewWithEstimates(1000, 0.02) // different false-positive rate
+
+	// Both operations must fail for mismatched rates.
+	if _, err := base.And(otherRate); err == nil {
+		t.Error("Expected AND operation to fail with different false positive rates")
+	}
+	if _, err := base.Or(otherRate); err == nil {
+		t.Error("Expected OR operation to fail with different false positive rates")
+	}
+
+	// A different element estimate; the outcome is only logged, not asserted,
+	// because the original note says the resulting k value may or may not differ.
+	otherSize := bloom.NewWithEstimates(2000, 0.01)
+
+	if _, err := base.And(otherSize); err != nil {
+		t.Log("AND operation correctly failed with different parameters")
+	} else {
+		t.Log("Note: Different element counts resulted in same k value, AND operation succeeded")
+	}
+
+	if _, err := base.Or(otherSize); err != nil {
+		t.Log("OR operation correctly failed with different parameters")
+	} else {
+		t.Log("Note: Different element counts resulted in same k value, OR operation succeeded")
+	}
+}
+
+// TestRunMakeBloom 测试makebloom命令
+func TestRunMakeBloom(t *testing.T) {
+	// 创建临时目录
+	tempDir := t.TempDir()
+
+	// 创建测试数据文件
+	dataFile := filepath.Join(tempDir, "test_data.txt")
+	outputFile := filepath.Join(tempDir, "test_output.bmp")
+
+	testData := "apple\nbanana\ncherry\ndate\nelderberry\n"
+	err := os.WriteFile(dataFile, []byte(testData), 0644)
+	if err != nil {
+		t.Fatalf("Failed to create test data file: %v", err)
+	}
+
+	// 运行makebloom命令
+	err = RunMakeBloom("-d", dataFile, "-o", outputFile, "-e", "1000", "-r", "0.01")
+	if err != nil {
+		t.Fatalf("RunMakeBloom failed: %v", err)
+	}
+
+	// 验证输出文件存在
+	if _, err := os.Stat(outputFile); os.IsNotExist(err) {
+		t.Error("Output file was not created")
+	}
+
+	// 加载并验证bitmap
+	bf, err := bloom.LoadFromFile(outputFile, false)
+	if err != nil {
+		t.Fatalf("Failed to load created bitmap: %v", err)
+	}
+
+	// 验证数据
+	lines := strings.Split(strings.TrimSpace(testData), "\n")
+	for _, line := range lines {
+		if !bf.TestString(line) {
+			t.Errorf("Element %s not found in created bitmap", line)
+		}
+	}
+}
+
+// TestRunHitTest 测试hittest命令
+func TestRunHitTest(t *testing.T) {
+	// 创建临时目录
+	tempDir := t.TempDir()
+
+	// 创建测试bitmap
+	bitmapFile := filepath.Join(tempDir, "test.bmp")
+	dataFile := filepath.Join(tempDir, "test_data.txt")
+	resultFile := filepath.Join(tempDir, "test_result.txt")
+
+	// 创建原始数据
+	originalData := []string{"apple", "banana", "cherry", "date"}
+
+	// 创建bitmap
+	bf := bloom.NewWithEstimates(1000, 0.01)
+	for _, data := range originalData {
+		bf.AddString(data)
+	}
+	bf.Flush()
+
+	err := bf.SaveToFile(bitmapFile)
+	if err != nil {
+		t.Fatalf("Failed to save bitmap: %v", err)
+	}
+
+	// 创建测试数据文件
+	testData := "apple\nbanana\nelderberry\nfig\n"
+	err = os.WriteFile(dataFile, []byte(testData), 0644)
+	if err != nil {
+		t.Fatalf("Failed to create test data file: %v", err)
+	}
+
+	// 运行hittest命令
+	err = RunHitTest("-d", dataFile, "-b", bitmapFile, "-o", resultFile)
+	if err != nil {
+		t.Fatalf("RunHitTest failed: %v", err)
+	}
+
+	// 验证结果文件
+	resultContent, err := os.ReadFile(resultFile)
+	if err != nil {
+		t.Fatalf("Failed to read result file: %v", err)
+	}
+
+	lines := strings.Split(strings.TrimSpace(string(resultContent)), "\n")
+
+	// 应该找到apple和banana，找不到elderberry和fig
+	expectedHits := 2
+	actualHits := 0
+	for _, line := range lines {
+		if strings.Contains(line, "apple") || strings.Contains(line, "banana") {
+			actualHits++
+		}
+	}
+
+	if actualHits != expectedHits {
+		t.Errorf("Expected %d hits, got %d", expectedHits, actualHits)
+	}
+}
+
+// TestRunInfo saves a small bitmap and runs the info command on it. Only the
+// absence of an error is checked; the printed output is not inspected.
+func TestRunInfo(t *testing.T) {
+	tempDir := t.TempDir()
+	bitmapFile := filepath.Join(tempDir, "test.bmp")
+
+	// Build the bitmap to inspect.
+	bf := bloom.NewWithEstimates(1000, 0.01)
+	bf.AddString("apple")
+	bf.AddString("banana")
+	// Flush before saving, as every other test in this file does after adding
+	// elements, so the saved file reflects both additions.
+	bf.Flush()
+
+	err := bf.SaveToFile(bitmapFile)
+	if err != nil {
+		t.Fatalf("Failed to save bitmap: %v", err)
+	}
+
+	// Run the info command; the actual output needs manual verification.
+	err = RunInfo("-b", bitmapFile)
+	if err != nil {
+		t.Errorf("RunInfo failed: %v", err)
+	}
+}
+
+// TestRunAnd saves two overlapping bitmaps, runs the and command on them, and
+// checks that the elements added to both inputs are present in the output.
+func TestRunAnd(t *testing.T) {
+	dir := t.TempDir()
+	firstPath := filepath.Join(dir, "test1.bmp")
+	secondPath := filepath.Join(dir, "test2.bmp")
+	resultPath := filepath.Join(dir, "and_result.bmp")
+
+	// First input bitmap.
+	first := bloom.NewWithEstimates(1000, 0.01)
+	for _, s := range []string{"apple", "banana", "cherry"} {
+		first.AddString(s)
+	}
+	first.Flush()
+	if err := first.SaveToFile(firstPath); err != nil {
+		t.Fatalf("Failed to save bitmap1: %v", err)
+	}
+
+	// Second input bitmap; it shares banana and cherry with the first.
+	second := bloom.NewWithEstimates(1000, 0.01)
+	for _, s := range []string{"banana", "cherry", "date"} {
+		second.AddString(s)
+	}
+	second.Flush()
+	if err := second.SaveToFile(secondPath); err != nil {
+		t.Fatalf("Failed to save bitmap2: %v", err)
+	}
+
+	if err := RunAnd("-b1", firstPath, "-b2", secondPath, "-o", resultPath); err != nil {
+		t.Fatalf("RunAnd failed: %v", err)
+	}
+
+	merged, err := bloom.LoadFromFile(resultPath, false)
+	if err != nil {
+		t.Fatalf("Failed to load result bitmap: %v", err)
+	}
+
+	// Both shared elements must be present in the result.
+	hasShared := merged.TestString("banana") && merged.TestString("cherry")
+	if !hasShared {
+		t.Error("Expected common elements missing from AND result")
+	}
+}
+
+// TestRunOr saves two disjoint bitmaps, runs the or command on them, and
+// checks that every element from either input is present in the output.
+func TestRunOr(t *testing.T) {
+	dir := t.TempDir()
+	firstPath := filepath.Join(dir, "test1.bmp")
+	secondPath := filepath.Join(dir, "test2.bmp")
+	resultPath := filepath.Join(dir, "or_result.bmp")
+
+	// First input bitmap.
+	first := bloom.NewWithEstimates(1000, 0.01)
+	for _, s := range []string{"apple", "banana"} {
+		first.AddString(s)
+	}
+	first.Flush()
+	if err := first.SaveToFile(firstPath); err != nil {
+		t.Fatalf("Failed to save bitmap1: %v", err)
+	}
+
+	// Second input bitmap.
+	second := bloom.NewWithEstimates(1000, 0.01)
+	for _, s := range []string{"cherry", "date"} {
+		second.AddString(s)
+	}
+	second.Flush()
+	if err := second.SaveToFile(secondPath); err != nil {
+		t.Fatalf("Failed to save bitmap2: %v", err)
+	}
+
+	if err := RunOr("-b1", firstPath, "-b2", secondPath, "-o", resultPath); err != nil {
+		t.Fatalf("RunOr failed: %v", err)
+	}
+
+	merged, err := bloom.LoadFromFile(resultPath, false)
+	if err != nil {
+		t.Fatalf("Failed to load result bitmap: %v", err)
+	}
+
+	// Every element from both inputs must be present.
+	for _, want := range []string{"apple", "banana", "cherry", "date"} {
+		if found := merged.TestString(want); !found {
+			t.Errorf("Element %s missing from OR result", want)
+		}
+	}
+}
+
+// TestBloomFilterEdgeCases adds an empty string, a 10000-byte string, and a
+// string of non-ASCII and punctuation characters, checking each is found.
+func TestBloomFilterEdgeCases(t *testing.T) {
+	filter := bloom.NewWithEstimates(100, 0.01)
+
+	// Each case is added, flushed, and queried in turn on the same filter.
+	cases := []struct {
+		input   string
+		failure string
+	}{
+		{"", "Empty string should be testable"},
+		{strings.Repeat("a", 10000), "Long string should be testable"},
+		{"测试中文字符符!@#$%^&*()", "Special characters should be testable"},
+	}
+
+	for _, tc := range cases {
+		filter.AddString(tc.input)
+		filter.Flush()
+
+		if !filter.TestString(tc.input) {
+			t.Error(tc.failure)
+		}
+	}
+}
+
+// TestBloomFilterConcurrency 测试并发安全性
+func TestBloomFilterConcurrency(t *testing.T) {
+	bf := bloom.NewWithEstimates(10000, 0.01)
+
+	// 使用多个goroutine并发添加元素
+	done := make(chan bool, 10)
+	for i := 0; i < 10; i++ {
+		go func(id int) {
+			for j := 0; j < 100; j++ {
+				data := fmt.Sprintf("item_%d_%d", id, j)
+				bf.AddString(data)
+			}
+			done <- true
+		}(i)
+	}
+
+	// 等待所有goroutine完成
+	for i := 0; i < 10; i++ {
+		<-done
+	}
+
+	// 刷新所有缓冲区
+	bf.Flush()
+
+	// 验证所有元素都存在
+	for i := 0; i < 10; i++ {
+		for j := 0; j < 100; j++ {
+			data := fmt.Sprintf("item_%d_%d", i, j)
+			if !bf.TestString(data) {
+				t.Errorf("Concurrent element %s not found", data)
+			}
+		}
+	}
+}
+
+// BenchmarkBloomFilterAdd measures adding b.N distinct strings to a filter.
+// The timed region includes the key formatting and the final Flush.
+func BenchmarkBloomFilterAdd(b *testing.B) {
+	filter := bloom.NewWithEstimates(1000000, 0.01)
+
+	// Exclude filter construction from the measurement.
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		key := fmt.Sprintf("item_%d", n)
+		filter.AddString(key)
+	}
+	filter.Flush()
+}
+
+// BenchmarkBloomFilterTest 性能测试：测试元素
+func BenchmarkBloomFilterTest(b *testing.B) {
+	bf := bloom.NewWithEstimates(100000, 0.01)
+
+	// 预先添加一些元素
+	for i := 0; i < 1000; i++ {
+		bf.AddString(fmt.Sprintf("item_%d", i))
+	}
+	bf.Flush()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		bf.TestString(fmt.Sprintf("item_%d", i%1000))
+	}
+}
+
+// BenchmarkBloomFilterAndOperation measures And on two filters that were
+// populated with the same 1000 keys.
+func BenchmarkBloomFilterAndOperation(b *testing.B) {
+	left := bloom.NewWithEstimates(10000, 0.01)
+	right := bloom.NewWithEstimates(10000, 0.01)
+
+	// Populate both filters with identical keys before timing starts.
+	for n := 0; n < 1000; n++ {
+		key := fmt.Sprintf("item_%d", n)
+		left.AddString(key)
+		right.AddString(key)
+	}
+	left.Flush()
+	right.Flush()
+
+	b.ResetTimer()
+	for n := 0; n < b.N; n++ {
+		if _, err := left.And(right); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// BenchmarkBloomFilterOrOperation 性能测试：OR操作
+func BenchmarkBloomFilterOrOperation(b *testing.B) {
+	bf1 := bloom.NewWithEstimates(10000, 0.01)
+	bf2 := bloom.NewWithEstimates(10000, 0.01)
+
+	// 预先添加元素
+	for i := 0; i < 1000; i++ {
+		bf1.AddString(fmt.Sprintf("item_%d", i))
+		bf2.AddString(fmt.Sprintf("item_%d", i+1000))
+	}
+	bf1.Flush()
+	bf2.Flush()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_, err := bf1.Or(bf2)
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+}