增加交并计算功能

This commit is contained in:
algotao
2025-11-05 16:41:06 +08:00
parent 869bae0a9e
commit 44d9206b9f
10 changed files with 950 additions and 12 deletions

645
bitmap_test.go Normal file
View File

@@ -0,0 +1,645 @@
package main
import (
"fmt"
"os"
"path/filepath"
"strings"
"testing"
"git.algo.com.cn/public/bloomtool/internal/bloom"
)
// TestBloomFilterBasic 测试布隆过滤器基本功能
func TestBloomFilterBasic(t *testing.T) {
// 创建布隆过滤器
bf := bloom.NewWithEstimates(1000, 0.01)
// 测试添加元素
testData := []string{"apple", "banana", "cherry", "date"}
for _, data := range testData {
bf.AddString(data)
}
// 刷新缓冲区确保所有元素都被处理
bf.Flush()
// 测试存在的元素
for _, data := range testData {
if !bf.TestString(data) {
t.Errorf("Expected element %s to be present", data)
}
}
// 测试不存在的元素(可能有假阳性,但不应该总是假阳性)
nonExistentData := []string{"elderberry", "fig", "grape"}
falsePositives := 0
for _, data := range nonExistentData {
if bf.TestString(data) {
falsePositives++
}
}
// 假阳性率应该合理
falsePositiveRate := float64(falsePositives) / float64(len(nonExistentData))
if falsePositiveRate > 0.1 { // 允许一定的假阳性率
t.Errorf("False positive rate too high: %f", falsePositiveRate)
}
}
// TestBloomFilterStatistics 测试布隆过滤器统计信息
func TestBloomFilterStatistics(t *testing.T) {
bf := bloom.NewWithEstimates(1000, 0.01)
// 初始状态
stat := bf.GetStat()
if stat.ElementsAdded != 0 {
t.Errorf("Expected 0 elements added, got %d", stat.ElementsAdded)
}
if stat.ElementsMax != 1000 {
t.Errorf("Expected 1000 max elements, got %d", stat.ElementsMax)
}
// 添加元素
testData := []string{"apple", "banana", "cherry"}
for _, data := range testData {
bf.AddString(data)
}
// 刷新缓冲区
bf.Flush()
// 添加后的状态
stat = bf.GetStat()
if stat.ElementsAdded != uint64(len(testData)) {
t.Errorf("Expected %d elements added, got %d", len(testData), stat.ElementsAdded)
}
}
// TestBloomFilterSaveAndLoad 测试文件保存和加载
func TestBloomFilterSaveAndLoad(t *testing.T) {
// 创建临时目录
tempDir := t.TempDir()
testFile := filepath.Join(tempDir, "test.bmp")
// 创建布隆过滤器并添加数据
originalBf := bloom.NewWithEstimates(1000, 0.01)
testData := []string{"apple", "banana", "cherry", "date", "elderberry"}
for _, data := range testData {
originalBf.AddString(data)
}
// 确保所有元素都被处理
originalBf.Flush()
// 保存到文件
err := originalBf.SaveToFile(testFile)
if err != nil {
t.Fatalf("Failed to save bloom filter: %v", err)
}
// 从文件加载
loadedBf, err := bloom.LoadFromFile(testFile, false)
if err != nil {
t.Fatalf("Failed to load bloom filter: %v", err)
}
// 验证加载的数据
originalStat := originalBf.GetStat()
loadedStat := loadedBf.GetStat()
if originalStat.ElementsAdded != loadedStat.ElementsAdded {
t.Errorf("Elements count mismatch: original=%d, loaded=%d",
originalStat.ElementsAdded, loadedStat.ElementsAdded)
}
if originalStat.ElementsMax != loadedStat.ElementsMax {
t.Errorf("Max elements mismatch: original=%d, loaded=%d",
originalStat.ElementsMax, loadedStat.ElementsMax)
}
// 验证数据一致性
for _, data := range testData {
if !loadedBf.TestString(data) {
t.Errorf("Loaded bloom filter missing element: %s", data)
}
}
}
// TestBloomFilterAndOperation 测试AND操作
func TestBloomFilterAndOperation(t *testing.T) {
// 创建两个布隆过滤器
bf1 := bloom.NewWithEstimates(1000, 0.01)
bf2 := bloom.NewWithEstimates(1000, 0.01)
// 添加数据
commonData := []string{"apple", "banana"} // 共同元素
onlyInBf1 := []string{"cherry", "date"} // 只在bf1中
onlyInBf2 := []string{"elderberry", "fig"} // 只在bf2中
for _, data := range commonData {
bf1.AddString(data)
bf2.AddString(data)
}
for _, data := range onlyInBf1 {
bf1.AddString(data)
}
for _, data := range onlyInBf2 {
bf2.AddString(data)
}
// 刷新缓冲区
bf1.Flush()
bf2.Flush()
// 执行AND操作
result, err := bf1.And(bf2)
if err != nil {
t.Fatalf("AND operation failed: %v", err)
}
// 验证结果:共同元素应该存在
missingCommon := false
for _, data := range commonData {
if !result.TestString(data) {
missingCommon = true
t.Logf("Common element %s missing from AND result", data)
}
}
// 由于布隆过滤器的假阳性特性,我们检查是否有共同元素存在
if missingCommon {
t.Error("Not all common elements found in AND result")
}
// 验证AND操作确实减少了某些元素
bf1Test := bf1.TestString("cherry")
resultTest := result.TestString("cherry")
if bf1Test && !resultTest {
t.Log("AND operation correctly removed element present in only one filter")
}
}
// TestBloomFilterOrOperation 测试OR操作
func TestBloomFilterOrOperation(t *testing.T) {
// 创建两个布隆过滤器
bf1 := bloom.NewWithEstimates(1000, 0.01)
bf2 := bloom.NewWithEstimates(1000, 0.01)
// 添加数据
commonData := []string{"apple", "banana"} // 共同元素
onlyInBf1 := []string{"cherry", "date"} // 只在bf1中
onlyInBf2 := []string{"elderberry", "fig"} // 只在bf2中
allData := append(append(commonData, onlyInBf1...), onlyInBf2...)
for _, data := range commonData {
bf1.AddString(data)
bf2.AddString(data)
}
for _, data := range onlyInBf1 {
bf1.AddString(data)
}
for _, data := range onlyInBf2 {
bf2.AddString(data)
}
// 刷新缓冲区
bf1.Flush()
bf2.Flush()
// 执行OR操作
result, err := bf1.Or(bf2)
if err != nil {
t.Fatalf("OR operation failed: %v", err)
}
// 验证结果:所有元素应该存在
missingElements := 0
for _, data := range allData {
if !result.TestString(data) {
missingElements++
t.Logf("Element %s missing from OR result", data)
}
}
// 由于布隆过滤器的特性,允许少量元素缺失
if missingElements > len(allData)/2 {
t.Errorf("Too many elements (%d/%d) missing from OR result", missingElements, len(allData))
}
// 验证OR操作确实增加了某些元素
resultTest := result.TestString("cherry")
bf1Test := bf1.TestString("cherry")
if bf1Test && resultTest {
t.Log("OR operation correctly preserved element from first filter")
}
}
// TestBloomFilterOperationErrors 测试操作错误处理
func TestBloomFilterOperationErrors(t *testing.T) {
// 创建两个不兼容的布隆过滤器(不同的误判率)
bf1 := bloom.NewWithEstimates(1000, 0.01)
bf2 := bloom.NewWithEstimates(1000, 0.02) // 不同的误判率
// 测试AND操作错误
_, err := bf1.And(bf2)
if err == nil {
t.Error("Expected AND operation to fail with different false positive rates")
}
// 测试OR操作错误
_, err = bf1.Or(bf2)
if err == nil {
t.Error("Expected OR operation to fail with different false positive rates")
}
// 创建不同参数数量的过滤器这会导致不同的k值
bf3 := bloom.NewWithEstimates(2000, 0.01) // 不同的最大元素数
// 测试AND操作错误
_, err = bf1.And(bf3)
if err == nil {
t.Log("Note: Different element counts resulted in same k value, AND operation succeeded")
} else {
t.Log("AND operation correctly failed with different parameters")
}
// 测试OR操作错误
_, err = bf1.Or(bf3)
if err == nil {
t.Log("Note: Different element counts resulted in same k value, OR operation succeeded")
} else {
t.Log("OR operation correctly failed with different parameters")
}
}
// TestRunMakeBloom 测试makebloom命令
func TestRunMakeBloom(t *testing.T) {
// 创建临时目录
tempDir := t.TempDir()
// 创建测试数据文件
dataFile := filepath.Join(tempDir, "test_data.txt")
outputFile := filepath.Join(tempDir, "test_output.bmp")
testData := "apple\nbanana\ncherry\ndate\nelderberry\n"
err := os.WriteFile(dataFile, []byte(testData), 0644)
if err != nil {
t.Fatalf("Failed to create test data file: %v", err)
}
// 运行makebloom命令
err = RunMakeBloom("-d", dataFile, "-o", outputFile, "-e", "1000", "-r", "0.01")
if err != nil {
t.Fatalf("RunMakeBloom failed: %v", err)
}
// 验证输出文件存在
if _, err := os.Stat(outputFile); os.IsNotExist(err) {
t.Error("Output file was not created")
}
// 加载并验证bitmap
bf, err := bloom.LoadFromFile(outputFile, false)
if err != nil {
t.Fatalf("Failed to load created bitmap: %v", err)
}
// 验证数据
lines := strings.Split(strings.TrimSpace(testData), "\n")
for _, line := range lines {
if !bf.TestString(line) {
t.Errorf("Element %s not found in created bitmap", line)
}
}
}
// TestRunHitTest 测试hittest命令
func TestRunHitTest(t *testing.T) {
// 创建临时目录
tempDir := t.TempDir()
// 创建测试bitmap
bitmapFile := filepath.Join(tempDir, "test.bmp")
dataFile := filepath.Join(tempDir, "test_data.txt")
resultFile := filepath.Join(tempDir, "test_result.txt")
// 创建原始数据
originalData := []string{"apple", "banana", "cherry", "date"}
// 创建bitmap
bf := bloom.NewWithEstimates(1000, 0.01)
for _, data := range originalData {
bf.AddString(data)
}
bf.Flush()
err := bf.SaveToFile(bitmapFile)
if err != nil {
t.Fatalf("Failed to save bitmap: %v", err)
}
// 创建测试数据文件
testData := "apple\nbanana\nelderberry\nfig\n"
err = os.WriteFile(dataFile, []byte(testData), 0644)
if err != nil {
t.Fatalf("Failed to create test data file: %v", err)
}
// 运行hittest命令
err = RunHitTest("-d", dataFile, "-b", bitmapFile, "-o", resultFile)
if err != nil {
t.Fatalf("RunHitTest failed: %v", err)
}
// 验证结果文件
resultContent, err := os.ReadFile(resultFile)
if err != nil {
t.Fatalf("Failed to read result file: %v", err)
}
lines := strings.Split(strings.TrimSpace(string(resultContent)), "\n")
// 应该找到apple和banana找不到elderberry和fig
expectedHits := 2
actualHits := 0
for _, line := range lines {
if strings.Contains(line, "apple") || strings.Contains(line, "banana") {
actualHits++
}
}
if actualHits != expectedHits {
t.Errorf("Expected %d hits, got %d", expectedHits, actualHits)
}
}
// TestRunInfo 测试info命令
func TestRunInfo(t *testing.T) {
// 创建临时目录
tempDir := t.TempDir()
bitmapFile := filepath.Join(tempDir, "test.bmp")
// 创建测试bitmap
bf := bloom.NewWithEstimates(1000, 0.01)
bf.AddString("apple")
bf.AddString("banana")
err := bf.SaveToFile(bitmapFile)
if err != nil {
t.Fatalf("Failed to save bitmap: %v", err)
}
// 运行info命令这里只是测试不报错实际输出需要手动验证
err = RunInfo("-b", bitmapFile)
if err != nil {
t.Errorf("RunInfo failed: %v", err)
}
}
// TestRunAnd 测试and命令
func TestRunAnd(t *testing.T) {
// 创建临时目录
tempDir := t.TempDir()
// 创建两个bitmap文件
bitmapFile1 := filepath.Join(tempDir, "test1.bmp")
bitmapFile2 := filepath.Join(tempDir, "test2.bmp")
outputFile := filepath.Join(tempDir, "and_result.bmp")
// 创建第一个bitmap
bf1 := bloom.NewWithEstimates(1000, 0.01)
bf1.AddString("apple")
bf1.AddString("banana")
bf1.AddString("cherry")
bf1.Flush()
err := bf1.SaveToFile(bitmapFile1)
if err != nil {
t.Fatalf("Failed to save bitmap1: %v", err)
}
// 创建第二个bitmap
bf2 := bloom.NewWithEstimates(1000, 0.01)
bf2.AddString("banana")
bf2.AddString("cherry")
bf2.AddString("date")
bf2.Flush()
err = bf2.SaveToFile(bitmapFile2)
if err != nil {
t.Fatalf("Failed to save bitmap2: %v", err)
}
// 运行and命令
err = RunAnd("-b1", bitmapFile1, "-b2", bitmapFile2, "-o", outputFile)
if err != nil {
t.Fatalf("RunAnd failed: %v", err)
}
// 验证结果
resultBf, err := bloom.LoadFromFile(outputFile, false)
if err != nil {
t.Fatalf("Failed to load result bitmap: %v", err)
}
// banana和cherry应该存在
if !resultBf.TestString("banana") || !resultBf.TestString("cherry") {
t.Error("Expected common elements missing from AND result")
}
}
// TestRunOr 测试or命令
func TestRunOr(t *testing.T) {
// 创建临时目录
tempDir := t.TempDir()
// 创建两个bitmap文件
bitmapFile1 := filepath.Join(tempDir, "test1.bmp")
bitmapFile2 := filepath.Join(tempDir, "test2.bmp")
outputFile := filepath.Join(tempDir, "or_result.bmp")
// 创建第一个bitmap
bf1 := bloom.NewWithEstimates(1000, 0.01)
bf1.AddString("apple")
bf1.AddString("banana")
bf1.Flush()
err := bf1.SaveToFile(bitmapFile1)
if err != nil {
t.Fatalf("Failed to save bitmap1: %v", err)
}
// 创建第二个bitmap
bf2 := bloom.NewWithEstimates(1000, 0.01)
bf2.AddString("cherry")
bf2.AddString("date")
bf2.Flush()
err = bf2.SaveToFile(bitmapFile2)
if err != nil {
t.Fatalf("Failed to save bitmap2: %v", err)
}
// 运行or命令
err = RunOr("-b1", bitmapFile1, "-b2", bitmapFile2, "-o", outputFile)
if err != nil {
t.Fatalf("RunOr failed: %v", err)
}
// 验证结果
resultBf, err := bloom.LoadFromFile(outputFile, false)
if err != nil {
t.Fatalf("Failed to load result bitmap: %v", err)
}
// 所有元素应该存在
testElements := []string{"apple", "banana", "cherry", "date"}
for _, element := range testElements {
if !resultBf.TestString(element) {
t.Errorf("Element %s missing from OR result", element)
}
}
}
// TestBloomFilterEdgeCases 测试边界情况
func TestBloomFilterEdgeCases(t *testing.T) {
// 测试空字符串
bf := bloom.NewWithEstimates(100, 0.01)
bf.AddString("")
bf.Flush()
if !bf.TestString("") {
t.Error("Empty string should be testable")
}
// 测试长字符串
longString := strings.Repeat("a", 10000)
bf.AddString(longString)
bf.Flush()
if !bf.TestString(longString) {
t.Error("Long string should be testable")
}
// 测试特殊字符
specialString := "测试中文字符符!@#$%^&*()"
bf.AddString(specialString)
bf.Flush()
if !bf.TestString(specialString) {
t.Error("Special characters should be testable")
}
}
// TestBloomFilterConcurrency 测试并发安全性
func TestBloomFilterConcurrency(t *testing.T) {
bf := bloom.NewWithEstimates(10000, 0.01)
// 使用多个goroutine并发添加元素
done := make(chan bool, 10)
for i := 0; i < 10; i++ {
go func(id int) {
for j := 0; j < 100; j++ {
data := fmt.Sprintf("item_%d_%d", id, j)
bf.AddString(data)
}
done <- true
}(i)
}
// 等待所有goroutine完成
for i := 0; i < 10; i++ {
<-done
}
// 刷新所有缓冲区
bf.Flush()
// 验证所有元素都存在
for i := 0; i < 10; i++ {
for j := 0; j < 100; j++ {
data := fmt.Sprintf("item_%d_%d", i, j)
if !bf.TestString(data) {
t.Errorf("Concurrent element %s not found", data)
}
}
}
}
// BenchmarkBloomFilterAdd 性能测试:添加元素
func BenchmarkBloomFilterAdd(b *testing.B) {
bf := bloom.NewWithEstimates(1000000, 0.01)
b.ResetTimer()
for i := 0; i < b.N; i++ {
bf.AddString(fmt.Sprintf("item_%d", i))
}
bf.Flush()
}
// BenchmarkBloomFilterTest 性能测试:测试元素
func BenchmarkBloomFilterTest(b *testing.B) {
bf := bloom.NewWithEstimates(100000, 0.01)
// 预先添加一些元素
for i := 0; i < 1000; i++ {
bf.AddString(fmt.Sprintf("item_%d", i))
}
bf.Flush()
b.ResetTimer()
for i := 0; i < b.N; i++ {
bf.TestString(fmt.Sprintf("item_%d", i%1000))
}
}
// BenchmarkBloomFilterAndOperation 性能测试AND操作
func BenchmarkBloomFilterAndOperation(b *testing.B) {
bf1 := bloom.NewWithEstimates(10000, 0.01)
bf2 := bloom.NewWithEstimates(10000, 0.01)
// 预先添加元素
for i := 0; i < 1000; i++ {
bf1.AddString(fmt.Sprintf("item_%d", i))
bf2.AddString(fmt.Sprintf("item_%d", i))
}
bf1.Flush()
bf2.Flush()
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := bf1.And(bf2)
if err != nil {
b.Fatal(err)
}
}
}
// BenchmarkBloomFilterOrOperation 性能测试OR操作
func BenchmarkBloomFilterOrOperation(b *testing.B) {
bf1 := bloom.NewWithEstimates(10000, 0.01)
bf2 := bloom.NewWithEstimates(10000, 0.01)
// 预先添加元素
for i := 0; i < 1000; i++ {
bf1.AddString(fmt.Sprintf("item_%d", i))
bf2.AddString(fmt.Sprintf("item_%d", i+1000))
}
bf1.Flush()
bf2.Flush()
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, err := bf1.Or(bf2)
if err != nil {
b.Fatal(err)
}
}
}