package main import ( "bufio" "flag" "log/slog" "os" "git.algo.com.cn/public/bloomtool/internal/bloom" ) const ( FalseRate = 0.00000001 // 误判率 千万分之一 ) func RunMakeBloom(args ...string) error { fs := flag.NewFlagSet("makebloom", flag.ExitOnError) txtFile := fs.String("d", "", "device id filename") bmpFile := fs.String("b", "", "bitmap filename for output") elements := fs.Uint64("e", 0, "max elements. (max 100 0000 0000). if 0 then auto") falseRate := fs.Float64("r", FalseRate, "false rate (0.01--0.0000 0000 1)") if err := fs.Parse(args); err != nil { return err } else if fs.NArg() > 0 || *txtFile == "" || *bmpFile == "" || *elements > 10000000000 || *falseRate > 0.01 || *falseRate < 0.000000001 { fs.Usage() return nil } return makeBloom(*txtFile, *bmpFile, *elements, *falseRate) } func makeBloom(txtFile string, bmpFile string, elements uint64, falseRate float64) error { // 打开设备号文件 slog.Info("open source file", "filename", txtFile) tfile, err := os.Open(txtFile) if err != nil { slog.Error("open source file error", "err", err) return err } defer tfile.Close() fstat, err := tfile.Stat() if err != nil { slog.Error("source file stat error", "err", err) return err } // 计算元素个数,并预留了一些空间。理论上单行md5为32字节,加回车(1个或2个字节)。 // 这里取30做安全系数。再加10000个保险 maxElements := uint64(0) if elements == 0 { maxElements = uint64((fstat.Size() / 30)) + 10000 } else { maxElements = elements } // 新建布隆过滤器 bloombmp := bloom.NewWithEstimates(maxElements, falseRate) // 逐行读取 scanner := bufio.NewScanner(tfile) lineCount := 1 for scanner.Scan() { if lineCount%100000 == 0 { slog.Info("read line", "lineno", lineCount) } // 转换成bloom bit 写入 bloombmp.AddString(scanner.Text()) lineCount++ } // 保存文件 slog.Info("save bitmap file", "filename", bmpFile) err = bloombmp.SaveToFile(bmpFile) if err != nil { slog.Error("save bitmap file error", "err", err) return err } slog.Info("save bitmap file done") return nil }