// 利用经典的大数据处理算法bloomfilter进行两个集合中相同元素的查找,去重
#include
#include
unsigned char mask[8] = {128, 64, 32, 16, 8, 4, 2, 1};
// 简单的哈希算法
int hashfuc(char* s, int key)
{
int i, seed[4] = {5, 7 ,11, 13}, value = 0;
if(key >= 4) key %= 4;
for(i = 0; s[i]; i++)
value += s[i]*seed[key];
return value;
}
// 利用bloomfilter算法将字符串s映射到位数组m中,并去掉重复的子串
void bloomfilter(unsigned char *m, char *s)
{
int i, j, hvalue, brepeat;
char substr[32];
for(i = j = 0; ; i++) {
if(s[i] != ' ' && s[i] != '\t' && s[i] != 0)
substr[j++] = s[i];
else {
substr[j] = 0;
brepeat = 1;
for(j = 0; j < 4; j++) {
hvalue = hashfuc(substr, j) & 0X7F;
if((m[hvalue>>3] & mask[hvalue&7]) == 0) {
m[hvalue>>3] |= mask[hvalue&7];
brepeat = 0;
}
}
// 如果是重复子串
if(brepeat == 1) {
j = strlen(substr);
strncpy(s+i-j, s+i+1, strlen(s)-i);
//printf("有重复子串%s, 去重后是%s\n", substr, s);
i = i - j - 1;
}
if(s[i] == 0) break;
j = 0;
}
}
}
int main()
{
char s1[256], s2[256], substr[32];
int i, j, hvalue;
unsigned char m1[16]={0}, m2[16]={0}, m3[16];
printf("First string\n");
gets(s1);
printf("Second string\n");
gets(s2);
bloomfilter(m1, s1);
bloomfilter(m2, s2);
// 求两个位数组的交集,子串能够映射到交集说明是相同的子串
for(i = 0; i < 16; i ++) {
m3[i] = m1[i] & m2[i];
//printf("%02x %02x %02x\n", m1[i], m2[i], m3[i]);
}
printf("\n相同的子串有:");
j = 0;
// 只需要对一个字符串进行映射,只要能映射到交集的就是子串
for(i = 0; ; i++) {
if(s1[i] != ' ' && s1[i] != '\t' && s1[i] != 0)
substr[j++] = s1[i];
else {
substr[j] = 0;
for(j = 0; j < 4; j++) {
hvalue = hashfuc(substr, j) & 0X7F;
if((m3[hvalue>>3] & mask[hvalue&7]) == 0) break;
}
if(j == 4) printf("%s ", substr);
if(s1[i] == 0) break;
j = 0;
}
}
printf("\n");
}