学习先知社区xq17大佬文章:探索URL相似去重的可落地实践 (一) 文章地址: https://xz.aliyun.com/t/13121?time__1311=mqmxnDBDcD0GQ0KDsQoYK0%3DFwZxI2GhGDbD&alichlgref=https%3A%2F%2Fwww.google.com%2F

1.SimHash介绍

simhash最早由Moses Charikar于论文《similarity estimation techniques from rounding algorithms》提出,论文地址:https://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CharikarEstim.pdf Google于论文《Detecting Near-Duplicates for Web Crawling》将该算法运用于网页相似性比较,论文地址: https://dl.acm.org/doi/abs/10.1145/1242572.1242592

2.已有项目解读

2.1 Simhash生成过程

Golang实现的SimHash项目:https://github.com/mfonda/simhash

package main

import (
"fmt"
"github.com/mfonda/simhash")

// main computes the SimHash of three sample documents and prints, for the
// first document against each of the other two, the value returned by
// simhash.Compare.
func main() {
	docs := [][]byte{
		[]byte("this is a test phrase"),
		[]byte("this is a test phrass"),
		[]byte("foo bar"),
	}

	hashes := make([]uint64, len(docs))
	for i, d := range docs {
		hashes[i] = simhash.Simhash(simhash.NewWordFeatureSet(d))
		// "\n" must be a real newline escape; the doubled backslashes that
		// were here printed literal backslashes instead of ending the line.
		fmt.Printf("Simhash of %s: %x\n", d, hashes[i])
	}

	fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[1], simhash.Compare(hashes[0], hashes[1]))
	fmt.Printf("Comparison of `%s` and `%s`: %d\n", docs[0], docs[2], simhash.Compare(hashes[0], hashes[2]))
}

跑出来:

Simhash of this is a test phrase: 8c3a5f7e9ecb3f35
Simhash of this is a test phrass: 8c3a5f7e9ecb3f21
Simhash of foo bar: d8dbe7186bad3db3
Comparison of `this is a test phrase` and `this is a test phrass`: 2
Comparison of `this is a test phrase` and `foo bar`: 29

重点算法函数是simhash.Simhash 循环遍历docs把上面的bytes切片经过NewWordFeatureSet函数再放到Simhash里运行

1.NewWordFeatureSet

goland ctrl+b 进到函数里

// WordFeatureSet wraps the bytes of a document that will be split into
// word features.
type WordFeatureSet struct {
	b []byte
}

// normalize replaces the stored bytes with a lower-cased copy.
func (w *WordFeatureSet) normalize() {
	lowered := bytes.ToLower(w.b)
	w.b = lowered
}

// NewWordFeatureSet stores b in a new WordFeatureSet, lower-cases the stored
// bytes and returns a pointer to the set.
func NewWordFeatureSet(b []byte) *WordFeatureSet {
	set := new(WordFeatureSet)
	set.b = b
	set.normalize()
	return set
}

WordFeatureSet:结构体有b字段。 normalize:用于将该字段转换为小写。 NewWordFeatureSet:通过将外面调用传入的参数b转换成fs结构体的b字段,再将fs的b字段变为小写,最后返回fs,fs为WordFeatureSet结构体指针。

2. Simhash

1.进入Simhash函数

// Simhash returns the 64-bit fingerprint of the features supplied by fs: the
// features are folded into a Vector, which is then reduced to a hash.
func Simhash(fs FeatureSet) uint64 {
	features := fs.GetFeatures()
	vector := Vectorize(features)
	return Fingerprint(vector)
}

// GetFeatures returns a []Feature representing each word in the byte slice,
// using the package-level boundaries pattern to find the words.
func (w *WordFeatureSet) GetFeatures() []Feature {
	features := getFeatures(w.b, boundaries)
	return features
}

2.w.b就是之前传入的经过更改的切片,尝试定位到boundaries发现是正则 进一步进入getFeatures函数

// boundaries matches one word: a run of word characters or apostrophes,
// optionally followed by "://" and a run of word characters, dots or slashes.
// Each escape uses a single backslash; inside a raw string a doubled
// backslash would match a literal '\' instead of forming \w.
var boundaries = regexp.MustCompile(`[\w']+(?:\://[\w\./]+){0,1}`)

// getFeatures finds every match of r in b and converts each match into a
// Feature with NewFeature. The result has one entry per match, in match order.
func getFeatures(b []byte, r *regexp.Regexp) []Feature {
	// Split the input into tokens.
	tokens := r.FindAll(b, -1)

	// Reserve room for exactly one feature per token.
	out := make([]Feature, 0, len(tokens))
	for _, tok := range tokens {
		out = append(out, NewFeature(tok))
	}
	return out
}

`[\w']+` 表示匹配一个或多个字母、数字、下划线或单引号(其中 `'` 表示一个单引号)。`(?:\://[\w\./]+){0,1}` 是一个非捕获组:`(?: … )` 表示非捕获组,`{0,1}` 表示该组出现零次或一次。在该组内部:`\://` 匹配 `://` 字符序列;`[\w\./]+` 匹配一个或多个字母、数字、下划线、点号或斜杠。 大体匹配结果:一个或多个字母、数字、下划线或单引号,后面可选地(零次或一次)跟着 `://` 字符序列以及一个或多个字母、数字、下划线、点号或斜杠。 测试:

tt := re.FindAll([]byte("aaa123..  http://asdsd.com/123?1212"), -1)
// tt holds: [aaa123 http://asdsd.com/123 1212]

虽然正则对 :// 做了专门匹配,但 ? 之后的查询参数仍会被拆成独立的单词,可能导致对url的匹配结果失去意义

匹配完成后创建words长度的Feature列表features,遍历结果words将每个单词经过NewFeature存入features列表中。 传入的匹配每个单词经过fnv哈希算法生成uint64的哈希结果,并设默认权重为1,将features返回。

// Feature is a hashed token together with the weight it carries.
type Feature interface {
	// Sum returns the 64-bit hash of the feature.
	Sum() uint64
	// Weight returns the weight of the feature.
	Weight() int
}

// feature is the concrete Feature produced by NewFeature.
type feature struct {
	sum    uint64
	weight int
}

// Sum returns the stored 64-bit hash.
func (f feature) Sum() uint64 { return f.sum }

// Weight returns the stored weight.
func (f feature) Weight() int { return f.weight }

// NewFeature hashes f with the 64-bit hash from hash/fnv and returns it as a
// feature whose weight is 1.
func NewFeature(f []byte) feature {
	hasher := fnv.New64()
	hasher.Write(f)
	return feature{sum: hasher.Sum64(), weight: 1}
}

3.回到Simhash函数中进入Vectorize函数。v是一个长度为64的整型数组;对每个feature循环64次,每次将64位fnv哈希结果右移i位再与1做与运算,结果为1则v[i]加上该feature的权重,否则减去该权重(默认权重为1)。

// Vector holds one signed accumulator for each bit of a 64-bit hash.
type Vector [64]int

// Vectorize folds features into a Vector. For every feature and every bit
// position i of its hash, the feature's weight is added to element i when
// the bit is 1 and subtracted when it is 0.
func Vectorize(features []Feature) Vector {
	var acc Vector
	for _, ft := range features {
		hash, w := ft.Sum(), ft.Weight()
		// Visit the hash bits from least significant (i = 0) to most
		// significant (i = 63).
		for pos := range acc {
			if (hash>>uint(pos))&1 == 1 {
				acc[pos] += w
				continue
			}
			acc[pos] -= w
		}
	}
	return acc
}

单独拉出下面的代码测试,发现跑出来是输入数字的二进制编码的每一位

sum := 217
for i := uint8(0); i < 8; i++ {
	// Shifting right by i and masking with 1 isolates bit i, lowest bit first.
	bit := (sum >> i) & 1
	// bit is an int, so it needs %d; %s would print "%!s(int=...)".
	fmt.Printf("%d\n", bit)
}

原代码右移时左侧补0,上述操作可以由低位到高位依次求得每个feature的sum的各个二进制位。如果feature的该二进制位为1,则v对应位置加上权重(默认为1),否则减去权重。

4.得到的v传到Fingerprint中,遍历v的每一位,如果大于等于零则f |= (1 << i),得到最终结果f

// Fingerprint collapses v into a 64-bit hash: bit i of the result is set
// exactly when v[i] is zero or positive.
func Fingerprint(v Vector) uint64 {
	var out uint64
	for pos, total := range v {
		if total < 0 {
			continue
		}
		out |= uint64(1) << uint(pos)
	}
	return out
}

2.2 相似度比较

1.汉明距离 汉明距离是一个在信息论和编码理论中非常重要的概念,它指的是两个等长字符串之间对应位置上不同字符的个数。例如,对于字符串“1011101”和“1001001”,它们的汉明距离是2,因为它们在从右往左数第3位和第5位上不同。参考:汉明距离

2.查看项目实现 传入待比较的两个Simhash结果

 // Compare returns the number of bit positions in which a and b differ.
func Compare(a uint64, b uint64) uint8 {
	// uint8 covers 0-255, which is enough for a count of at most 64.
	var distance uint8
	// a ^ b has a 1 wherever the inputs differ; diff &= diff - 1 clears the
	// lowest set bit, so the loop runs once per differing bit.
	for diff := a ^ b; diff != 0; diff &= diff - 1 {
		distance++
	}
	return distance
}

uint64共有64位,汉明距离最大为64,因此用uint8(0-255)保存结果足够

3.相似度标准化

1.标准化:[https://zh.wikipedia.org/wiki/标准化_(统计学)](https://zh.wikipedia.org/wiki/标准化_(统计学)) 汉明距离不超过3可认为两者相似,由此可以得到相似度公式:当相似度大于等于95%时,两个比较文本hash指纹的汉明距离应不超过3

2.原文文章里提到的一开始作者想到的相似度计算方法: $$P = 100 - \frac{x}{64} \times 100$$ 其中 $x \ge 0$ 为汉明距离;当 $x \le 3$ 时,$P \ge 95$

4.url处理

1.调用net/url可以解析url路径

package main

import (
"fmt"
"net/url")

// main parses a sample URL with net/url and prints its scheme, host, path
// and raw query string.
func main() {
	// URL string to parse. It must not be wrapped in angle brackets: with a
	// leading '<' no scheme is recognised and url.Parse rejects the input
	// because the first path segment then contains a colon.
	urlString := "https://example.com/path?foo=bar&baz=qux"

	parsedURL, err := url.Parse(urlString)
	if err != nil {
		fmt.Println("Error parsing URL:", err)
		return
	}

	// Print the individual components of the parsed URL.
	fmt.Println("Scheme:", parsedURL.Scheme)
	fmt.Println("Host:", parsedURL.Host)
	fmt.Println("Path:", parsedURL.Path)
	fmt.Println("Query:", parsedURL.RawQuery)
}

2.对url分权重,即 fragment: 0.5 scheme: 0.5 params: 2 paths: 3 hostname: 4 total = 0.5 + 0.5 + 2 + 3 + 4 = 10

// Parse parses rawURL into a URL. The text after the first "#" is treated as
// the fragment and handled separately from the rest; for example
// "http://www.aaa.com:80/123/7.php?a=1&b=2&c=3#h11aha1" is parsed as
// "http://www.aaa.com:80/123/7.php?a=1&b=2&c=3" plus the fragment "h11aha1".
// Failures are returned wrapped in *Error.
func Parse(rawURL string) (*URL, error) {
	// Cut off the #fragment before parsing everything else.
	base, fragment, _ := strings.Cut(rawURL, "#")
	parsed, err := parse(base, false)
	if err != nil {
		return nil, &Error{"parse", base, err}
	}
	if fragment != "" {
		if fragErr := parsed.setFragment(fragment); fragErr != nil {
			return nil, &Error{"parse", rawURL, fragErr}
		}
	}
	return parsed, nil
}

// parse parses rawURL, which must not contain a fragment, into a URL.
// viaRequest selects stricter handling: an empty string and a string that
// does not start with "/" and has no scheme are both rejected when it is true.
func parse(rawURL string, viaRequest bool) (*URL, error) {
var rest string
var err error
// A URL must not contain control characters.
if stringContainsCTLByte(rawURL) {
return nil, errors.New("net/url: invalid control character in URL")
}
if rawURL == "" && viaRequest {
return nil, errors.New("empty url")
}
url := new(URL)

// "*" is accepted as-is and stored as the path.
if rawURL == "*" {
url.Path = "*"
return url, nil
}
// Split off the scheme; rest receives what follows it.
if url.Scheme, rest, err = getScheme(rawURL); err != nil {
return nil, err
}
// Store the scheme in lower case.
url.Scheme = strings.ToLower(url.Scheme)
// Extract the query. If the only "?" is the final character, drop it and
// record that fact in ForceQuery; otherwise everything after the first "?"
// becomes RawQuery.
if strings.HasSuffix(rest, "?") && strings.Count(rest, "?") == 1 {
url.ForceQuery = true
rest = rest[:len(rest)-1]
} else {
rest, url.RawQuery, _ = strings.Cut(rest, "?")
}
// Handle input whose remainder does not start with "/".
if !strings.HasPrefix(rest, "/") {
// With a scheme present the remainder is stored in Opaque and parsing ends.
if url.Scheme != "" {
url.Opaque = rest
return url, nil
}
if viaRequest {
return nil, errors.New("invalid URI for request")
}

// Without a scheme, the text before the first "/" must not contain ":".
if segment, _, _ := strings.Cut(rest, "/"); strings.Contains(segment, ":") {
return nil, errors.New("first path segment in URL cannot contain colon")
}
}

// A remainder starting with "//" carries an authority section: it runs up
// to the next "/" and is split into User and Host; what follows it is the path.
if (url.Scheme != "" || !viaRequest && !strings.HasPrefix(rest, "///")) && strings.HasPrefix(rest, "//") {
var authority string
authority, rest = rest[2:], ""
if i := strings.Index(authority, "/"); i >= 0 {
authority, rest = authority[:i], authority[i:]
}
url.User, url.Host, err = parseAuthority(authority)
if err != nil {
return nil, err
}
} else if url.Scheme != "" && strings.HasPrefix(rest, "/") {
// A scheme followed directly by a single-slash path: record that no
// host was given.
url.OmitHost = true
}
// Whatever is left is the path.
if err := url.setPath(rest); err != nil {
return nil, err
}
return url, nil
}

// stringContainsCTLByte reports whether s contains an ASCII control byte,
// that is, a byte below 0x20 (space) or the byte 0x7f.
func stringContainsCTLByte(s string) bool {
	for _, c := range []byte(s) {
		if c == 0x7f || c < ' ' {
			return true
		}
	}
	return false
}

// getScheme splits rawURL at the first ":" into a scheme and the remainder.
//
// A scheme must start with a letter and may continue with letters, digits,
// '+', '-' and '.'. If rawURL does not begin with such a scheme followed by
// ":", the scheme is returned empty and path is rawURL unchanged. A rawURL
// that starts with ":" yields the error "missing protocol scheme".
func getScheme(rawURL string) (scheme, path string, err error) {
	for i := 0; i < len(rawURL); i++ {
		c := rawURL[i]
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
			// Letters are allowed at any position; keep scanning.
		case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.':
			// These characters may not open a scheme.
			if i == 0 {
				return "", rawURL, nil
			}
		case c == ':':
			if i == 0 {
				return "", "", errors.New("missing protocol scheme")
			}
			return rawURL[:i], rawURL[i+1:], nil
		default:
			// Any other character means there is no valid scheme. This
			// return must be a statement, not part of the comment: without
			// it, input such as "a/b:c" would yield the scheme "a/b".
			return "", rawURL, nil
		}
	}
	return "", rawURL, nil
}

3.每个url分块权重策略:外面传进来的每一行url分解成五块,分别是scheme+hostname+paths+params+fragment,权重初始化设置为 fragment:0.5 scheme:0.5 params:2 paths:3 hostname:4

5. 方案实现

1.在原来文章基础修改
原文提供关键代码(改了url组成的变量名):

// GetFeaturesFromURI parses uri and returns weighted features for its
// scheme, its host, every path segment and every query parameter. The path
// weight is divided evenly between the path segments, and the query weight
// evenly between the parameters. The fragment is not turned into a feature.
//
// If uri cannot be parsed, the result is nil together with the parse error.
func GetFeaturesFromURI(uri string) ([]Feature, error) {
	parse, err := url.Parse(uri)
	if err != nil {
		// The first result is a slice, so the failure value is nil.
		return nil, err
	}
	urlWeights := Setutval_wei(parse.Host, parse.Path, parse.RawQuery, parse.Fragment, parse.Scheme)

	// Clean up the URL: collapse "//" in the path, drop everything up to and
	// including the first "/", and collapse "&&" in the query.
	urlWeights.Path.Value = strings.ReplaceAll(urlWeights.Path.Value, "//", "/")
	_, urlWeights.Path.Value, _ = strings.Cut(urlWeights.Path.Value, "/")
	urlWeights.RawQuery.Value = strings.ReplaceAll(urlWeights.RawQuery.Value, "&&", "&")

	// Split the path into segments and the query into parameters.
	pathSplit := strings.Split(urlWeights.Path.Value, "/")
	paramSplit := strings.Split(urlWeights.RawQuery.Value, "&")

	// Share each section's weight between its parts.
	pathWeight := calculateWeight(urlWeights.Path.Weight, len(pathSplit))
	paramWeight := calculateWeight(urlWeights.RawQuery.Weight, len(paramSplit))

	// Room for every part plus the scheme and host; the approach taken from
	// the article leaves the fragment out.
	features := make([]Feature, 0, len(pathSplit)+len(paramSplit)+2)
	appendFeature := func(val string, weight float64) {
		features = append(features, NewFeatureWithWeight([]byte(val), weight))
	}

	appendFeature(urlWeights.Scheme.Value, urlWeights.Scheme.Weight)
	appendFeature(urlWeights.Host.Value, urlWeights.Host.Weight)
	for _, value := range pathSplit {
		appendFeature(value, pathWeight)
	}
	for _, value := range paramSplit {
		appendFeature(value, paramWeight)
	}
	return features, nil
}

// calculateWeight returns totalWeight divided by partsCount. When partsCount
// is zero or negative, totalWeight is returned unchanged.
func calculateWeight(totalWeight float64, partsCount int) float64 {
	if partsCount <= 0 {
		return totalWeight
	}
	return totalWeight / float64(partsCount)
}

// similarity converts the result of Compare(a, b) into a percentage:
// 100 when Compare returns 0, and 0 when it returns 64.
func similarity(a uint64, b uint64) float64 {
	distance := float64(Compare(a, b))
	return 100 - (distance/64.0)*100
}

url结构体设计与赋值

// pw pairs one component of a URL with the weight assigned to it.
type pw struct {
	Value  string
	Weight float64
}

// urlt holds the weighted components of a URL.
type urlt struct {
	Host     pw
	Path     pw
	RawQuery pw
	Fragment pw
	Scheme   pw
}

// Setutval_wei builds a urlt from the five component strings, in the order
// host, path, raw query, fragment, scheme, and attaches the fixed weights
// 4, 3, 2, 0.5 and 0.5 to them respectively.
func Setutval_wei(val1, val2, val3, val4, val5 string) urlt {
	var weighted urlt
	weighted.Host = pw{val1, 4}
	weighted.Path = pw{val2, 3}
	weighted.RawQuery = pw{val3, 2}
	weighted.Fragment = pw{val4, 0.5}
	weighted.Scheme = pw{val5, 0.5}
	return weighted
}

2.在此基础上完成发现Weight类型报错,一路溯回去找到feature结构体,把weight改为float64
把对应调用到weight的结构体函数等类型做好更改,将Vector类型改为[64]float64

3.总代码如下(为了节省篇幅删除了部分原有注释):

package simhash  

import (
"bytes"
"golang.org/x/text/unicode/norm"
"hash/fnv"
"net/url"
"regexp"
"strings"
)

// Vector holds one signed accumulator for each bit of a 64-bit hash.
type Vector [64]float64

// Feature consists of a 64-bit hash and a weight.
type Feature interface {
	// Sum returns the 64-bit sum of this feature.
	Sum() uint64
	// Weight returns the weight of this feature.
	Weight() float64
}

// FeatureSet is anything that can produce a list of features.
type FeatureSet interface {
	GetFeatures() []Feature
}

// Vectorize folds features into a Vector. For every feature and every bit
// position i of its hash, the feature's weight is added to element i when
// the bit is 1 and subtracted when it is 0.
func Vectorize(features []Feature) Vector {
	var acc Vector
	for _, ft := range features {
		hash, w := ft.Sum(), ft.Weight()
		// Visit the hash bits from least significant (i = 0) to most
		// significant (i = 63).
		for pos := range acc {
			if (hash>>uint(pos))&1 == 1 {
				acc[pos] += w
				continue
			}
			acc[pos] -= w
		}
	}
	return acc
}

// VectorizeBytes folds raw byte features into a Vector. Each feature is
// hashed with the 64-bit hash from hash/fnv; for every bit position i of the
// hash, element i is incremented when the bit is 1 and decremented when it
// is 0.
func VectorizeBytes(features [][]byte) Vector {
	var acc Vector
	hasher := fnv.New64()
	for _, raw := range features {
		// One hasher is reused for all features, so clear it first.
		hasher.Reset()
		hasher.Write(raw)
		hash := hasher.Sum64()
		for pos := range acc {
			if (hash>>uint(pos))&1 == 0 {
				acc[pos]--
			} else {
				acc[pos]++
			}
		}
	}
	return acc
}

// Fingerprint collapses v into a 64-bit hash: bit i of the result is set
// exactly when v[i] is zero or positive.
func Fingerprint(v Vector) uint64 {
	var out uint64
	for pos, total := range v {
		if total < 0 {
			continue
		}
		out |= uint64(1) << uint(pos)
	}
	return out
}

// feature is a 64-bit hash paired with a floating-point weight.
type feature struct {
	sum    uint64
	weight float64
}

// Sum returns the stored 64-bit hash.
func (f feature) Sum() uint64 { return f.sum }

// Weight returns the stored weight.
func (f feature) Weight() float64 { return f.weight }

// NewFeature hashes f with the 64-bit hash from hash/fnv and returns it as a
// feature whose weight is 1.
func NewFeature(f []byte) feature {
	hasher := fnv.New64()
	hasher.Write(f)
	return feature{sum: hasher.Sum64(), weight: 1}
}

// NewFeatureWithWeight hashes f the same way NewFeature does and returns it
// as a feature carrying the given weight.
func NewFeatureWithWeight(f []byte, weight float64) feature {
	hasher := fnv.New64()
	hasher.Write(f)
	return feature{sum: hasher.Sum64(), weight: weight}
}

// Compare returns the number of bit positions in which a and b differ.
func Compare(a uint64, b uint64) uint8 {
	var distance uint8
	// a ^ b has a 1 wherever the inputs differ; diff &= diff - 1 clears the
	// lowest set bit, so the loop runs once per differing bit.
	for diff := a ^ b; diff != 0; diff &= diff - 1 {
		distance++
	}
	return distance
}

// Simhash returns the 64-bit fingerprint of the features supplied by fs: the
// features are folded into a Vector, which is then reduced to a hash.
func Simhash(fs FeatureSet) uint64 {
	features := fs.GetFeatures()
	vector := Vectorize(features)
	return Fingerprint(vector)
}

func SimhashBytes(b [][]byte) uint64 {
return Fingerprint(VectorizeBytes(b))
}

// WordFeatureSet wraps the bytes of a document that will be split into
// word features.
type WordFeatureSet struct {
	b []byte
}

// normalize replaces the stored bytes with a lower-cased copy.
func (w *WordFeatureSet) normalize() {
	lowered := bytes.ToLower(w.b)
	w.b = lowered
}

// NewWordFeatureSet stores b in a new WordFeatureSet, lower-cases the stored
// bytes and returns a pointer to the set.
func NewWordFeatureSet(b []byte) *WordFeatureSet {
	set := new(WordFeatureSet)
	set.b = b
	set.normalize()
	return set
}

// boundaries matches one word: a run of word characters or apostrophes,
// optionally followed by "://" and a run of word characters, dots or slashes.
var boundaries = regexp.MustCompile(`[\w']+(?:\://[\w\./]+){0,1}`)
// unicodeBoundaries matches a run of characters from the bracketed class
// below, which includes Unicode letters (\pL) and the apostrophe.
// NOTE(review): inside the class, `-_` looks like it forms a range from '-'
// to '_' rather than two literal characters - confirm that this is intended.
var unicodeBoundaries = regexp.MustCompile(`[\pL-_']+`)


// GetFeatures returns one Feature for every match of the package-level
// boundaries pattern in the stored bytes.
func (w *WordFeatureSet) GetFeatures() []Feature {
	features := getFeatures(w.b, boundaries)
	return features
}

// UnicodeWordFeatureSet wraps the bytes of a document together with the
// norm.Form that is applied to them before they are split into features.
type UnicodeWordFeatureSet struct {
	b []byte
	f norm.Form
}

// NewUnicodeWordFeatureSet stores b and f in a new set, normalizes the
// stored bytes and returns a pointer to the set.
func NewUnicodeWordFeatureSet(b []byte, f norm.Form) *UnicodeWordFeatureSet {
	set := &UnicodeWordFeatureSet{b: b, f: f}
	set.normalize()
	return set
}

// normalize passes the stored bytes through the form's Append method,
// starting from a nil destination, and stores the lower-cased result.
func (w *UnicodeWordFeatureSet) normalize() {
	normalized := w.f.Append(nil, w.b...)
	w.b = bytes.ToLower(normalized)
}

// GetFeatures returns one Feature for every match of the package-level
// unicodeBoundaries pattern in the stored bytes.
func (w *UnicodeWordFeatureSet) GetFeatures() []Feature {
	features := getFeatures(w.b, unicodeBoundaries)
	return features
}

// getFeatures finds every match of r in b and converts each match into a
// Feature with NewFeature. The result has one entry per match, in match order.
func getFeatures(b []byte, r *regexp.Regexp) []Feature {
	// Split the input into tokens.
	tokens := r.FindAll(b, -1)

	// Reserve room for exactly one feature per token.
	out := make([]Feature, 0, len(tokens))
	for _, tok := range tokens {
		out = append(out, NewFeature(tok))
	}
	return out
}

// Shingle returns the w-shingles of b: every run of w consecutive elements
// of b, joined with a single space.
//
// When w is 1, b itself is returned. When w exceeds len(b) it is reduced to
// len(b). Shingle panics if w is less than 1.
func Shingle(w int, b [][]byte) [][]byte {
	switch {
	case w < 1:
		// TODO: use error here instead of panic?
		panic("simhash.Shingle(): k must be a positive integer")
	case w == 1:
		return b
	case w > len(b):
		w = len(b)
	}

	n := len(b) - w + 1
	sep := []byte(" ")
	out := make([][]byte, 0, n)
	for start := 0; start < n; start++ {
		out = append(out, bytes.Join(b[start:start+w], sep))
	}
	return out
}

// pw pairs one component of a URL with the weight assigned to it.
type pw struct{
// Value is the text of the component.
Value string
// Weight is the weight given to the component.
Weight float64
}

// urlt holds the weighted components of a URL.
type urlt struct {
Host pw
Path pw
RawQuery pw
Fragment pw
Scheme pw
}



//func getFeaturesFromURI(uri string) ([]Feature, error) {
// GetFeaturesFromURI parses uri and returns weighted features for its
// scheme, its host, every path segment and every query parameter, in that
// order. The path weight is divided evenly between the path segments, and
// the query weight evenly between the parameters. The fragment is not turned
// into a feature. If uri cannot be parsed, nil is returned.
func GetFeaturesFromURI(uri string) []Feature {
	parsed, err := url.Parse(uri)
	if err != nil {
		return nil
	}
	weighted := Setutval_wei(parsed.Host, parsed.Path, parsed.RawQuery, parsed.Fragment, parsed.Scheme)

	// Clean up the path: collapse "//" and drop everything up to and
	// including the first "/".
	path := strings.ReplaceAll(weighted.Path.Value, "//", "/")
	_, path, _ = strings.Cut(path, "/")
	// Clean up the query: collapse "&&".
	query := strings.ReplaceAll(weighted.RawQuery.Value, "&&", "&")

	segments := strings.Split(path, "/")
	params := strings.Split(query, "&")

	// Share each section's weight between its parts.
	segmentWeight := calculateWeight(weighted.Path.Weight, len(segments))
	paramWeight := calculateWeight(weighted.RawQuery.Weight, len(params))

	// Room for every part plus the scheme and host.
	features := make([]Feature, 0, len(segments)+len(params)+2)
	features = append(features,
		NewFeatureWithWeight([]byte(weighted.Scheme.Value), weighted.Scheme.Weight),
		NewFeatureWithWeight([]byte(weighted.Host.Value), weighted.Host.Weight),
	)
	for _, segment := range segments {
		features = append(features, NewFeatureWithWeight([]byte(segment), segmentWeight))
	}
	for _, param := range params {
		features = append(features, NewFeatureWithWeight([]byte(param), paramWeight))
	}
	return features
}


// Setutval_wei builds a urlt from the five component strings, in the order
// host, path, raw query, fragment, scheme, and attaches the fixed weights
// 4, 3, 2, 0.5 and 0.5 to them respectively.
func Setutval_wei(val1, val2, val3, val4, val5 string) urlt {
	var weighted urlt
	weighted.Host = pw{Value: val1, Weight: 4}
	weighted.Path = pw{Value: val2, Weight: 3}
	weighted.RawQuery = pw{Value: val3, Weight: 2}
	weighted.Fragment = pw{Value: val4, Weight: 0.5}
	weighted.Scheme = pw{Value: val5, Weight: 0.5}
	return weighted
}

// calculateWeight returns totalWeight divided by partsCount. When partsCount
// is zero or negative, totalWeight is returned unchanged.
func calculateWeight(totalWeight float64, partsCount int) float64 {
	if partsCount <= 0 {
		return totalWeight
	}
	return totalWeight / float64(partsCount)
}

main.go

package main  

import (
	"bufio"
	"fmt"
	"os"

	"github.com/mfonda/simhash"
)

// main reads URLs from test.txt, prints the SimHash of each one, and then
// prints the similarity percentage of every pair of URLs.
func main() {
	// Path of the input file.
	filePath := "test.txt"

	// Read the URLs from the file.
	urls, err := inputFile(filePath)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// Hash every URL.
	hashes := make([]uint64, len(urls))
	fmt.Printf("Result of Simhash :\n")
	for i, u := range urls {
		hashes[i] = simhash.Fingerprint(simhash.Vectorize(simhash.GetFeaturesFromURI(string(u))))
		fmt.Printf("Simhash of %s is %x\n", u, hashes[i])
	}

	// Compare every pair exactly once. The loop already covers the pair
	// (0, 1), so no separate print of urls[0] and urls[1] is needed; such a
	// print would also index out of range when the file holds fewer than
	// two URLs.
	fmt.Printf("Result of Comparision :\n")
	for i := range urls {
		for j := i + 1; j < len(urls); j++ {
			fmt.Printf("Comparison of `%s` and `%s`: %.5f%% \n", urls[i], urls[j], similarity(hashes[i], hashes[j]))
		}
	}
}

// inputFile reads the file at filePath line by line and returns each
// non-empty line as a byte slice, in file order. It returns an error if the
// file cannot be opened or if reading it fails.
func inputFile(filePath string) ([][]byte, error) {
	// Open the text file.
	file, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("error opening file: %w", err)
	}
	defer file.Close()

	var urls [][]byte

	// Read the file one line at a time.
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		// The URL on the current line. This must be a real statement: when
		// it is part of the comment, url is undefined below.
		url := scanner.Text()
		if url == "" {
			// A blank line carries no URL.
			continue
		}
		urls = append(urls, []byte(url))
	}

	// Report any error hit while scanning.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error scanning file: %w", err)
	}

	return urls, nil
}

// similarity converts the result of simhash.Compare(a, b) into a
// percentage: 100 when Compare returns 0, and 0 when it returns 64.
func similarity(a uint64, b uint64) float64 {
	distance := float64(simhash.Compare(a, b))
	return 100 - (distance/64.0)*100
}

运行测试:

6.工具开发

方案

6.1 txt文件去重实现

txt去重实现:
例如有5个url需要比4+3+2+1 次
创建一个map[string]bool名为Resmap,所有txt中的url的bool值为1,
第一轮对比下和第一个url相似度高的将Resmap值设为False,以此类推。

// main reads URLs from test.txt, drops every URL that is more than 95%
// similar to an earlier one, and appends the surviving URLs to res.txt in
// their original order.
func main() {
	// Input and output paths.
	filePath := "test.txt"
	savePath := "res.txt"

	// Read the URLs from the file.
	urls, err := inputFile(filePath)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// Hash every URL.
	hashes := make([]uint64, len(urls))
	for i, u := range urls {
		hashes[i] = simhash.Fingerprint(simhash.Vectorize(simhash.GetFeaturesFromURI(string(u))))
		fmt.Printf("[-] %s Simhash of %s is %x\n", time.Now().Format("2006-01-02 15:04:05"), u, hashes[i])
	}

	// keep[i] reports whether urls[i] survives. Flags are tracked per line
	// rather than per URL string: with a map keyed by the URL, a URL that
	// appears twice would mark itself as a duplicate and vanish entirely.
	keep := make([]bool, len(urls))
	for i := range keep {
		keep[i] = true
	}
	for i := range urls {
		for j := i + 1; j < len(urls); j++ {
			if similarity(hashes[i], hashes[j]) > 95.0 {
				keep[j] = false
			}
		}
	}

	// Write the survivors in input order. The URL is concatenated directly
	// instead of being used as a format string, so "%" escapes inside a URL
	// are written unchanged.
	for i, u := range urls {
		if keep[i] {
			saveFile(savePath, string(u)+"\n")
		}
	}
}

效果展示:
测试样例
运行结果

过滤掉了相似param的url。

6.2 软件封装

原本准备用cobra写的,写到一半从杨宝那得知有封装好的flag,选择用flag写

package main  

import (
	"bufio"
	"flag"
	"fmt"
	"os"
	"strings"
	"time"

	"github.com/mfonda/simhash"
)

// Command-line options; main binds each of them to a flag.
var (
// filep is the path of the file holding the URLs to de-duplicate (-f).
filep string
// savep is the path of the file that receives the result (-o).
savep string
// similar is the similarity threshold, given as a fraction such as 0.95 (-s).
similar float64
// wt is the colon-separated host:path:param:frag:scheme weight list (-p).
wt string
)

// epsilon is the tolerance url_de uses when comparing a similarity with the
// threshold.
const epsilon = 1e-7

// main parses the command-line flags, checks that the weight list has five
// colon-separated fields and that the input file exists, and then runs
// url_de. Every failure is reported on standard output.
func main() {
	flag.StringVar(&filep, "f", "./urls.txt", "指定待去重url文件路径")
	flag.StringVar(&savep, "o", "./output.txt", "去重结果输出至文件")
	flag.Float64Var(&similar, "s", 0.95, "指定相似度,去除比较结果中高于该相似度的url")
	flag.StringVar(&wt, "p", "4:3:2:0.5:0.5", "自定义host:path:param:frag:scheme的比例,请参照默认值格式输入")
	flag.Parse()

	// Turn the weight string into a list.
	wa := strings.Split(wt, ":")
	fmt.Printf("%s\n", wa)
	if len(wa) != 5 {
		fmt.Println("请输入正确格式的数据")
		return
	}

	// Check that the input file exists before doing any work.
	_, err := os.Stat(filep)
	switch {
	case err == nil:
		url_de(filep, savep, similar, wa)
	case os.IsNotExist(err):
		fmt.Println("文件不存在!")
	default:
		fmt.Println("发生错误:", err)
	}
}
// url_de de-duplicates the URLs stored in the file filep and appends the
// survivors to the file savep in their original order.
//
// similar is the similarity threshold as a fraction (for example 0.95): a
// URL is dropped when its similarity to an earlier URL exceeds the threshold
// by more than epsilon. weight_arr is handed to simhash.GetFeaturesFromURI
// unchanged.
func url_de(filep string, savep string, similar float64, weight_arr []string) {
	// Read the URLs from the file.
	urls, err := inputFile(filep)
	if err != nil {
		fmt.Println("Error:", err)
		return
	}

	// Hash every URL.
	hashes := make([]uint64, len(urls))
	for i, u := range urls {
		hashes[i] = simhash.Fingerprint(simhash.Vectorize(simhash.GetFeaturesFromURI(string(u), weight_arr)))
		fmt.Printf("[-] %s Simhash of %s is %x\n", time.Now().Format("2006-01-02 15:04:05"), u, hashes[i])
	}

	fmt.Printf("Simhash比对中 ...\n")
	// keep[i] reports whether urls[i] survives. Flags are tracked per line
	// rather than per URL string: with a map keyed by the URL, a URL that
	// appears twice would mark itself as a duplicate and vanish entirely.
	keep := make([]bool, len(urls))
	for i := range keep {
		keep[i] = true
	}
	for i := range urls {
		for j := i + 1; j < len(urls); j++ {
			// similarity returns a percentage; the threshold is a fraction.
			if (similarity(hashes[i], hashes[j])/100 - similar) > epsilon {
				keep[j] = false
			}
		}
	}

	fmt.Printf("结果保存中 ...\n")
	// Write the survivors in input order. The URL is concatenated directly
	// instead of being used as a format string, so "%" escapes inside a URL
	// are written unchanged.
	for i, u := range urls {
		if keep[i] {
			saveFile(savep, string(u)+"\n")
		}
	}
	// Print, not Printf: the path must not be interpreted as a format string.
	fmt.Print("结果保存至" + savep)
}

// saveFile appends str to the file at savePath, creating the file with mode
// 0644 if it does not exist. It panics if the file cannot be opened or
// written.
func saveFile(savePath, str string) {
	const flags = os.O_APPEND | os.O_WRONLY | os.O_CREATE
	out, err := os.OpenFile(savePath, flags, 0644)
	if err != nil {
		panic(err)
	}
	defer out.Close()

	// Write the string to the file.
	if _, err := out.WriteString(str); err != nil {
		panic(err)
	}
}

// inputFile reads the file at filePath line by line and returns each
// non-empty line as a byte slice, in file order. It returns an error if the
// file cannot be opened or if reading it fails.
func inputFile(filePath string) ([][]byte, error) {
	// Open the text file.
	file, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("error opening file: %w", err)
	}
	defer file.Close()

	var urls [][]byte

	// Read the file one line at a time.
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		// The URL on the current line. This must be a real statement: when
		// it is part of the comment, url is undefined below.
		url := scanner.Text()
		if url == "" {
			// A blank line carries no URL.
			continue
		}
		urls = append(urls, []byte(url))
	}

	// Report any error hit while scanning.
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("error scanning file: %w", err)
	}

	return urls, nil
}

// similarity converts the result of simhash.Compare(a, b) into a
// percentage: 100 when Compare returns 0, and 0 when it returns 64.
func similarity(a uint64, b uint64) float64 {
	distance := float64(simhash.Compare(a, b))
	return 100 - (distance/64.0)*100
}

权重传入只需赋给原始权重即可:给simhash包中的GetFeaturesFromURI增加一个权重参数(对应上面url_de里的两参数调用),并用传入的权重覆盖Setutval_wei中的默认权重。

使用示例: