區域性敏感雜湊：原始 LSH 的 C++ 實現

之前專案中用到 lsh 演算法來做特徵檢索，對 lsh 演算法很好奇。最近看了 lsh 的相關資料，依照自己的理解，初步寫了 lsh 的程式碼。測試效果不是特別理想，引數的選擇也基本靠嘗試，姑且先把程式碼放上來，之後再改進吧（2016.01.24）。

#include 
#include 
#include 
#include 
using
namespace
std;
using
namespace cv;
/* 演算法思路
1. 確定引數 k(bitcount)、以及雜湊表個數l; （這裡先設定為固定值，不知道怎麼求最優值）
2. 生成一級雜湊函式;
3. 資料一級雜湊到bucket
4. 壓縮bucket.用二級雜湊函式把一級的bucket雜湊到二級雜湊表中（需要確定二級雜湊表的容量m）;
*/const
int k = 700; //或者用展開成01串的特徵的長度來計算k 
const
int l = 10;
const
int maxvalue = 255;//特徵為uchar型別
const
int m = 100; //二級雜湊表的容量
struct bucket
;struct table 
;class mylsh
;~mylsh(){};
public:
int init( int m, int l, float ratio,int featuredims );
int train(mat featurtes);
int search(mat feature,int& name, int& dist);//輸入查詢特徵 返回特徵名 和距離
private:
//int creathashfamily(); 
int gethashfun_level1();//生成一級雜湊函式
int hash_level1( mat feature,int hashfunid,vector
char>& value );//輸入一級雜湊函式、原始特徵，得到原始特徵的對映value;這裡用的演算法避免了把數字展開成01串
int hash_level2( vector
char> value, int& bucketid );//輸入一級雜湊對映後的value，計算二級雜湊對映後的value(也就是二級雜湊表中的桶的id)
int calcdist( mat feature0, mat feature1,int& dist);
private:
int m_k;//隨機選取的位數
int m_d;//轉換成01串後的特徵維數;當然得注意，實際並沒有顯式的轉成01字串，m_d只是用到而已；
int m_l;//一級雜湊表的個數
int m_m;//二級雜湊表的容量
int m_featuredims;//影象特徵維數
vector
> m_hashfun_level1;//現在沒用到
vector
>> m_hashfun_level1subset; 
vector
m_hashfun_level2;
table m_table;
//vector> m_hashfamily;
};#include "mylsh.h"
#include 
int mylsh::init( int m, int l, float ratio,int featuredims)
for ( i = 0; i //初始化二級雜湊函式
}return1;}
int mylsh::train(mat featurtes)
nums = featurtes.rows;
for ( i = 0; i < nums; i++ )//every img
//二級hash 
ret = hash_level2(value,bucketid);
if ( 0 == ret )
//把特徵以及特徵對應名稱存到桶中
m_table.bucket[bucketid].features.push_back(feature.clone());
m_table.bucket[bucketid].names.push_back(i);
m_table.bucket[bucketid].bucketsize = int(m_table.bucket[bucketid].names.size());}}
return1;}
//生成一級hash函式
int mylsh::gethashfun_level1()
}//m_hashfun_level1.push_back(temp);
}return1;}
int mylsh::hash_level1( mat feature,int hashfunid,vector
char>& value )
else
while ( one_nums > 0 )
while ( zero_nums > 0 )}}
return1;}
int mylsh::hash_level2( vector
char> value, int& bucketid )
bucketid = val % m_m;
return1;}
int mylsh::search(mat feature,int& name, int& dist)
dist = -1;
name = -1;
int ret,i,j,bucketid;
vector
char> value;
vector
buckets;
vector
tempfeature;
vector
tempname;
float mindist = 1000000;
for ( i = 0; i < m_l; i++ )
//二級hash 
ret = hash_level2(value,bucketid);
if ( 0 == ret )
buckets.push_back(bucketid);
}//在buckets內的所有特徵中尋找距離最小的點  bruteforce
cout
<<"候選桶的數量為："
<<(int)buckets.size()tempname = m_table.bucket[buckets[i]].names;
cout
<<"該候選桶內含點的數量為："
<<(int)tempfeature.size()if ( dist < mindist )
}tempfeature.clear();
tempname.clear();
}dist = mindist;
return1;}
int mylsh::calcdist(mat feature0, mat feature1,int&dist)
dist = 0;
int i;
int dims = feature0.cols;
for ( i = 0; i abs(feature0.at(0,i) - feature1.at(0,i));
}return1;}
#include "mylsh.h"
#include 
int main()
tempfeature.at(0,j) -= 10;
ret = lsh.search(tempfeature,name,dist);
if ( name == i )
}cout
<<"accuracy:"
1;}

區域性敏感雜湊LSH

參考資料；簡單介紹；在茫茫人海中發現相似的你——區域性敏感雜湊（lsh）；基本思想：區域性敏感雜湊的基本思想類似於一種空間域轉換思想。lsh 演算法基於一個假設：如果兩個文字在原有的資料空間是相似的，那麼分別經過雜湊函式轉換以後的它們也具有很高的相似度；相反，如果它們本身是不相似的，那麼經過轉換後它們應仍...

區域性敏感雜湊演算法

該演算法是一種衡量文字相似度的演算法，下面介紹一下這個演算法的主要思想。假設我們有三段文字：1，我愛你中國。2，我愛北京天安門。3，我愛吃蘋果。一、分詞，形成如下矩陣（各列依次為文字 1、文字 2、文字 3）：我 1 1 1；愛 1 1 1；你 1 0 0；中國 1 0 0；北京 0 1 0；天安門 0 1 0...

區域性敏感雜湊之分層法與雜湊碼法

學到現在越來越感覺計算機網路、作業系統的重要性，組成原理倒沒感覺出來。求推薦資料，我想要的是描述性解釋，教材不是我想要的，謝謝！感覺自己的知識很老舊，在沒有出國也沒去高水平大學的條件下，只能通過網路學習了，感謝。在檢索技術中，索引一直是需要研究的核心技術。當下，索引技術主要分為三類：基於樹的索引技術 ...

區域性敏感雜湊：原始 LSH 的 C++ 實現

區域性敏感雜湊LSH

區域性敏感雜湊演算法

區域性敏感雜湊之分層法與雜湊碼法

相關推薦

區域性敏感雜湊：原始 LSH 的 C++ 實現