<progress id="1qxls"></progress>

<progress id="1qxls"><rt id="1qxls"></rt></progress>

溫馨提示×

溫馨提示×

您好，登錄后才能下訂單哦！

密碼登錄×

忘記密碼？

登錄注冊(cè)×

獲取短信驗(yàn)證碼

其他方式登錄

點(diǎn)擊登錄注冊(cè) 即表示同意《億速云用戶服務(wù)條款》

用戶登錄×

賬戶密碼登錄

請(qǐng)使用微信掃描上方二維碼

使用幫助

請(qǐng)求超時(shí)！

請(qǐng)點(diǎn)擊重新獲取二維碼

位圖（BitMap）&& 布隆過濾器（BloomFilter）

發(fā)布時(shí)間：2020-07-18 04:44:02 來源：網(wǎng)絡(luò) 閱讀：1043 作者：威尼斯小艇欄目：編程語言

【面試題】給40億個(gè)不重復(fù)的無符號(hào)整數(shù)，沒排過序。給一個(gè)無符號(hào)整數(shù)，如何快速判斷一個(gè)數(shù)是否在這40億個(gè)數(shù)中。

● 在看到這個(gè)題后最先想到的方法是遍歷這40億個(gè)數(shù)，依次進(jìn)行判斷，但此做法需要的內(nèi)存很大，大約為15G（4000000000 * 4 ÷（1024*1024*1024）），可見此算法不可取。

● 如果內(nèi)存夠的話，我們可以通過位圖實(shí)現(xiàn)，位圖一個(gè)數(shù)組每個(gè)數(shù)據(jù)的每個(gè)二進(jìn)制位表示一個(gè)數(shù)據(jù)，每一位用0，1表示當(dāng)前這個(gè)位置上是否存有值，同樣是利用哈希存儲(chǔ)的方法。此做法可以大大減少內(nèi)存，對(duì)于此題是一個(gè)int類型就可以編程32個(gè)位，需要的內(nèi)存空間從15G降到500M。

具體實(shí)現(xiàn)如下：

#pragma
class BitMap//位圖
{
public:
    BitMap()
    :_size(0)
    {}
    BitMap(size_t size)//size表示多少位，不是數(shù)據(jù)個(gè)數(shù)
    : _size(size)
    {//調(diào)整大小為size / 32 + 1即右移5位加1（加1：需要的大小要包含size，例如10%8=1，大小應(yīng)為2）
    	_a.resize((size >> 5) + 1);
    }
    //位圖中，注意1在移位時(shí)為左移num不是左移32-num；
    void Set(size_t x)//存入x位，置1
    {
    	size_t index = x >> 5;
    	size_t num = x % 32;//eg:x = 35,num = 3,則在位圖中為_a[1]中設(shè)為001
    	++_size;
    	_a[index] |= 1 << num;//1左移3位，進(jìn)行|使_a中對(duì)應(yīng)處為
    }
    void Remove(size_t x)//刪除x位，置0
    {
    	size_t index = x >> 5;
    	size_t num = x % 32;//eg:x = 35,num = 3,則在位圖中為_a[1]中設(shè)為110
    	--_size;
    	_a[index] &= (~(1 << num));//1右移3位取反0，進(jìn)行&使_a中對(duì)應(yīng)處為0
    }
    bool Test(size_t x)//判斷是否存在
    {
    	size_t index = x >> 5;
    	size_t num = x % 32;
    	if (_a[index] & (1 << num))//如果當(dāng)前位為1，則存在
    	{
    		return true;
    	}
    	return false;
    }
    void Resize(size_t size)//重置大小
    {
    	_a.resize((size >> 5) + 1);
    }
    size_t Size()//返回位圖的總位數(shù)
    {
    	return _size;
    }
    size_t Capacity()//返回int數(shù)據(jù)個(gè)數(shù)
    {
    	return _a.size();
    }
    void Print()
    {
    	for (size_t i = 0; i < _a.size(); i++)
    	{
    		cout << _a[i] << " " << endl;
    	}
    	cout << endl;
    }
private:
	vector<size_t> _a;
	size_t _size;
};

void TestBitMap()
{
	BitMap bm(65);
	bm.Set(3);
	bm.Set(4);
	bm.Set(5);
	bm.Print();
	cout << "is 4 EXTST? " << bm.Test(4) << endl;
	cout << "is 8 EXTST? " << bm.Test(8) << endl;

	bm.Remove(4);
	cout << "is 4 EXTST? " << bm.Test(4) << endl;
	bm.Print();

	cout << "size: " << bm.Size() << endl;
	cout << "capacity: " << bm.Capacity() << endl;
	bm.Resize(100);
	cout << "capacity: " << bm.Capacity() << endl;
}

● Bloom Filter 是一種空間效率很高的隨機(jī)數(shù)據(jù)結(jié)構(gòu)，Bloom filter 可以看做是對(duì) bit-map 的擴(kuò)展, 它的原理是：

當(dāng)一個(gè)元素被加入集合時(shí)，通過 K 個(gè) Hash函數(shù)將這個(gè)元素映射成一個(gè)位陣列（Bit array）中的 K 個(gè)點(diǎn)，把它們置為 1。檢索時(shí)，只要看看這些點(diǎn)是不是都是 1 就知道集合中有沒有它了。

1、如果這些點(diǎn)有任何一個(gè) 0，則被檢索元素一定不在；

2、如果都是 1，則被檢索元素可能在。

用了素?cái)?shù)表和6個(gè)哈希算法：

#pragma

size_t _GetNextPrime(size_t size)//素?cái)?shù)表：獲取下一個(gè)素?cái)?shù)
{
	const int _PrimeSize = 28;
	static const unsigned long _PrimeList[_PrimeSize] =
	{
		53ul, 97ul, 193ul, 389ul, 769ul,
		1543ul, 3079ul, 6151ul, 12289ul, 24593ul,
		49157ul, 98317ul, 196613ul, 393241ul, 786433ul,
		1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,
		50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,
		1610612741ul, 3221225473ul, 4294967291ul
	};
	for (size_t i = 0; i < _PrimeSize; ++i)
	{
		if (_PrimeList[i] > size)
		{
			return _PrimeList[i];
		}
		return _PrimeList[i - 1];
	}
	return _PrimeList[_PrimeSize];//如果size大于或等于素?cái)?shù)表中數(shù)據(jù)，就返回表中最大數(shù)
}
//6種字符串哈希算法
template<class T>
size_t BKDRHash(const T * str)
{
	register size_t hash = 0;
	while (size_t ch = (size_t)*str++)
	{
		hash = hash * 131 + ch;   // 也可以乘以31、131、1313、13131、131313..         
	}
	return hash;
}
template<class T>
size_t SDBMHash(const T *str)
{
	register size_t hash = 0;
	while (size_t ch = (size_t)*str++)
	{
		hash = 65599 * hash + ch;
		//hash = (size_t)ch + (hash << 6) + (hash << 16) - hash;  
	}
	return hash;
}
template<class T>
size_t RSHash(const T *str)
{
	register size_t hash = 0;
	size_t magic = 63689;
	while (size_t ch = (size_t)*str++)
	{
		hash = hash * magic + ch;
		magic *= 378551;
	}
	return hash;
}
template<class T>
size_t APHash(const T *str)
{
	register size_t hash = 0;
	size_t ch;
	for (long i = 0; ch = (size_t)*str++; i++)
	{
		if ((i & 1) == 0)
		{
			hash ^= ((hash << 7) ^ ch ^ (hash >> 3));
		}
		else
		{
			hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));
		}
	}
	return hash;
}
template<class T>
size_t JSHash(const T *str)
{
	if (!*str)        // 這是由本人添加，以保證空字符串返回哈希值0  
		return 0;
	register size_t hash = 1315423911;
	while (size_t ch = (size_t)*str++)
	{
		hash ^= ((hash << 5) + ch + (hash >> 2));
	}
	return hash;
}
template<class T>
size_t DEKHash(const T* str)
{
	if (!*str)        // 以保證空字符串返回哈希值0  
		return 0;
	register size_t hash = 1315423911;
	while (size_t ch = (size_t)*str++)
	{
		hash = ((hash << 5) ^ (hash >> 27)) ^ ch;
	}
	return hash;
}
//6個(gè)仿函數(shù)分別進(jìn)行6種字符串算法的調(diào)用
template<class T>
struct _HashFunc1
{
	size_t operator()(const T& str)
	{
		return BKDRHash(str.c_str());
	}
};
template<class T>
struct _HashFunc2
{
	size_t operator()(const T& str)
	{
		return SDBMHash(str.c_str());
	}
}; 
template<class T>
struct _HashFunc3
{
	size_t operator()(const T& str)
	{
		return RSHash(str.c_str());
	}
}; 
template<class T>
struct _HashFunc4
{
	size_t operator()(const T& str)
	{
		return APHash(str.c_str());
	}
}; 
template<class T>
struct _HashFunc5
{
	size_t operator()(const T& str)
	{
		return JSHash(str.c_str());
	}
};
template<class T>
struct _HashFunc6
{
	size_t operator()(const T& str)
	{
		return DEKHash(str.c_str());
	}
};

布隆過濾器具體實(shí)現(xiàn)如下：

#define _CRT_SECURE_NO_WARNINGS 1

template<class K = string, 
class HashFunc1 = _HashFunc1<K>, class HashFunc2 = _HashFunc2<K>, 
class HashFunc3 = _HashFunc3<K>, class HashFunc4 = _HashFunc4<K>, 
class HashFunc5 = _HashFunc5<K>, class HashFunc6 = _HashFunc6<K>>
class BloomFilter
{
public:
	BloomFilter(size_t size = 0)
	{
		_capacity = _GetNextPrime(size);
		_bm.Resize(_capacity);//調(diào)用BitMap的Resize調(diào)整大小
	}
	void Set(const K& key)
	{
		size_t index1 = HashFunc1()(key);
		size_t index2 = HashFunc2()(key);
		size_t index3 = HashFunc3()(key);
		size_t index4 = HashFunc4()(key);
		size_t index5 = HashFunc5()(key);
		size_t index6 = HashFunc6()(key); 

		_bm.Set(index1 % _capacity);//設(shè)置的位為index1 % _capacity，調(diào)用BitMap的Set
		_bm.Set(index2 % _capacity);
		_bm.Set(index3 % _capacity);
		_bm.Set(index4 % _capacity);
		_bm.Set(index5 % _capacity);
		_bm.Set(index6 % _capacity);
	}
	bool Test(const K& key)
	{
		//只要存在一個(gè)為0就不存在，否則存在
		size_t index1 = HashFunc1()(key);
		if (!_bm.Test(index1 % _capacity))
			return false;

		size_t index2 = HashFunc2()(key);
		if(!_bm.Test(index2 % _capacity))
			return false;

		size_t index3 = HashFunc3()(key);
		if(!_bm.Test(index3 % _capacity))
			return false;

		size_t index4 = HashFunc4()(key);
		if(!_bm.Test(index4 % _capacity))
			return false;

		size_t index5 = HashFunc5()(key);
		if(!_bm.Test(index5 % _capacity))
			return false;

		size_t index6 = HashFunc6()(key);
		if(!_bm.Test(index6 % _capacity))
			return false;
		return true;
	}
private:
	BitMap _bm;
	size_t _capacity;
};
void TestBloomFilter()
{
	BloomFilter<> bf(50);
	bf.Set("Scen");
	bf.Set("斯洛");
	bf.Set("https://blog.51cto.com/user_index.php?action=addblog_new&job=modify&tid=1773181");
	cout << "exist? " << bf.Test("Scen") << endl;
	cout << "exist? " << bf.Test("Scenluo") << endl;
	cout << "exist? " << bf.Test("斯洛") << endl;
	cout << "exist? " << bf.Test("https://blog.51cto.com/user_index.php?action=addblog_new&job=modify&tid=1773181") << endl;
	cout << "exist? " << bf.Test("https://blog.51cto.com/user_index.php?action=addblog_new&job=modify&tid=1773131") << endl;

}

布隆過濾器的缺陷：

1、誤算率（False Positive）是其中之一。

隨著存入的元素?cái)?shù)量增加，誤算率隨之增加。但是如果元素?cái)?shù)量太少，則使用散列表足矣。所以我們用多個(gè)哈希表去存儲(chǔ)一個(gè)數(shù)據(jù)。那么問題又來了，我們是多用一些呢，還是少用一些。如果多用哈希表的話，如上面的題，一個(gè)哈希就需要500M，那么放的越多是不是越占內(nèi)存啊。如果太少的話是不是誤算率就高啊，所以取個(gè)適中的。我的程序?qū)崿F(xiàn)是取了六個(gè)哈希表。

2、如果當(dāng)前位置為0肯定不存在，但是為1不一定存在。

一般布隆過濾器只支持設(shè)計(jì)，不支持刪除?？梢酝ㄟ^引用計(jì)數(shù)，但空間浪費(fèi)較大。

向AI問一下細(xì)節(jié)

推薦閱讀：

免責(zé)聲明：本站發(fā)布的內(nèi)容（圖片、視頻和文字）以原創(chuàng)、轉(zhuǎn)載和分享為主，文章觀點(diǎn)不代表本網(wǎng)站立場(chǎng)，如果涉及侵權(quán)請(qǐng)聯(lián)系站長(zhǎng)郵箱：is@yisu.com進(jìn)行舉報(bào)，并提供相關(guān)證據(jù)，一經(jīng)查實(shí)，將立刻刪除涉嫌侵權(quán)內(nèi)容。

上一篇新聞：
centos7創(chuàng)建ftp服務(wù)器
下一篇新聞：
DOM——遍歷.parent()、.parents()、.closest()和.next()

猜你喜歡

AI
助
手

產(chǎn)品服務(wù)

地區(qū)劃分

專題活動(dòng)

幫助支持

關(guān)于我們

售后咨詢

7*24小時(shí)在線電話：400-100-2938

7*24小時(shí)在線 QQ：800811969

關(guān)注億速云

億速云公眾號(hào)

手機(jī)網(wǎng)站二維碼

<kbd id="rgbjp"></kbd>