您好,登錄后才能下訂單哦!
本篇內容介紹了“分析數據庫實現原理”的有關知識,在實際案例的操作過程中,不少人都會遇到這樣的困境,接下來就讓小編帶領大家學習一下如何處理這些情況吧!希望大家仔細閱讀,能夠學有所成!
Hash連接,如內存足夠,首先遍歷內表創建Hash表,然后遍歷外表,對連接鍵計算HashCode,如一致,則遍歷Hash表中具有同一HashCode的鏈表,值一致,則返回該值。
如內存不夠,可遍歷兩張表,使用同樣的Hash函數把表拆分為N個Hash“分區”,遍歷內表每一個Hash分區和外表相應的Hash分區,如找到與連接鍵值一致的數據,則返回該值。
詳見代碼注釋。
#include <stdio.h> #include <stdlib.h> #include "hash_join.h" #define MAX_ELEMENTS 1024 //生成hash code static int generate_hashcode(int n) { return n % HASH_BUCKET; } //生成hash桶(寫入到文件中,以文件的方式模擬) static int generate_bucket(FILE *file,char *tag) { printf("----------- generate_bucket ---------- \n"); //數組 char buf[MAX_BYTES]; FILE *fd = NULL; for(;!feof(file);) { int x = read_int(file,buf); if(x == 0) break; int hashcode = generate_hashcode(x); char filename[30]; sprintf(filename,"/cygdrive/d/tmp/hash/%s_%d.csv",tag,hashcode); //printf("Hash code is %d,Bucket filename is %s.\n",hashcode,filename); fd = fopen(filename,"a"); if(fd == NULL) { printf("Can not open file %s.\n",filename); return 0; } //寫入文件中 write_int(fd,x); fclose(fd); } return 1; } //把hash表加載到內存中,適用于內存足夠的情況 //使用二維數組模擬Hash表,D1 : hash桶,D2 : 桶中的數據 static int load_hashtable(int ht[][MAX_ELEMENTS]) { printf("----------- load_hashtable ---------- \n"); for(int i=0;i < HASH_BUCKET;i++) { //循環桶號 char filename[MAX_BYTES]; //讀文件 sprintf(filename,"/cygdrive/d/tmp/hash/inner_%d.csv",i); FILE *fd = fopen(filename,"r"); if(fd == NULL){ //printf("Can not open file : %s\n",filename); continue; } int j=0; char buf[MAX_BYTES]; for(;!feof(fd) && j < MAX_ELEMENTS;) { //把文件內容放到數組中 int x = read_int(fd,buf); ht[i][j++] = x; } fclose(fd); } return 1; } //使用內存創建hash表進行hash連接 static void hash_join_onmemory(FILE *outerfile,FILE *innerfile) { printf("----------- hash_join_onmemory ---------- \n"); int ht[HASH_BUCKET][MAX_ELEMENTS]; char buffer[MAX_BYTES]; int flag = 0; //創建hash bucket文件 flag = generate_bucket(innerfile,"inner"); if(!flag) { printf("Can not generate bucket file!\n"); return; } //加載到hash表中(二維數組模擬) flag = load_hashtable(ht); if(!flag) { printf("Can not load hash table!\n"); return; } //遍歷第二個文件,執行JOIN for(;!feof(outerfile);) { //讀第二個文件,執行join int outer = read_int(outerfile,buffer); //計算hashcode int hashcode = generate_hashcode(outer); for(int i=0;i < MAX_ELEMENTS;i++) { //遍歷hash桶中的數據,找到對應的數據 if(ht[hashcode][i] == outer) { 
printf("Found one,hash bucket is %d,value is : %d.\n",hashcode,outer); } } } } //使用磁盤緩存進行hash連接 static void hash_join_ondisk(FILE *outerfile,FILE *innerfile) { printf("----------- hash_join_ondisk ---------- \n"); char buffer[MAX_BYTES]; int flag = 0; //創建hash"桶"文件 flag = generate_bucket(innerfile,"inner"); if(!flag) { printf("Can not generate inner bucket file!\n"); return; } flag = generate_bucket(outerfile,"outer"); if(!flag) { printf("Can not generate outer bucket file!\n"); return; } //遍歷hash值相同的文件,執行連接 for(int i=0;i < HASH_BUCKET;i++) { //從0號桶開始 char innerfname[MAX_BYTES]; char outerfname[MAX_BYTES]; //讀文件 sprintf(innerfname,"/cygdrive/d/tmp/hash/%s_%d.csv","inner",i); sprintf(outerfname,"/cygdrive/d/tmp/hash/%s_%d.csv","outer",i); FILE *fd_inner = fopen(innerfname,"r"); if(fd_inner == NULL){ //printf("Can not open file : %s\n",filename); continue; } FILE *fd_outer = fopen(outerfname,"r"); if(fd_outer == NULL) { continue; } for(;!feof(fd_outer);) { int v_out = read_int(fd_outer,buffer); if(v_out == 0) continue; for(;!feof(fd_inner);) { int v_in = read_int(fd_inner,buffer); if(v_in == 0) continue; if(v_out == v_in) { printf("Found one,hash bucket is %d,value is : %d.\n",i,v_out); } } rewind(fd_inner); } } } //執行Hash連接 void hash_join(char *file1,char * file2,char *flag) { printf("----------- hash join ---------- \n"); FILE *outerfile = fopen(file1,"r"); if(outerfile == NULL) { printf("Can not open file %s.\n",file1); return; } //打開第二個文件 FILE *innerfile = fopen(file2,"r"); if(innerfile == NULL) { printf("Can not open file %s.\n",file2); return; } //執行JOIN if(strcmp(flag,"memory") == 0) hash_join_onmemory(outerfile,innerfile); else hash_join_ondisk(outerfile,innerfile); //關閉 fclose(outerfile); fclose(innerfile); }
運行輸出
$ cat file1.csv 1 2 3 4 5 1 234 2939 9002 20 $ cat file2.csv 11 20 3 40 55 50 234 33 90 1 $ /cygdrive/d/tmp/test.exe file1.csv file2.csv ------------- use memory ------------------ ----------- hash join ---------- ----------- hash_join_onmemory ---------- ----------- generate_bucket ---------- ----------- load_hashtable ---------- Found one,hash bucket is 1,value is : 1. Found one,hash bucket is 3,value is : 3. Found one,hash bucket is 1,value is : 1. Found one,hash bucket is 106,value is : 234. Found one,hash bucket is 20,value is : 20. ------------- use disk ------------------ ----------- hash join ---------- ----------- hash_join_ondisk ---------- ----------- generate_bucket ---------- ----------- generate_bucket ---------- Found one,hash bucket is 1,value is : 1. Found one,hash bucket is 1,value is : 1. Found one,hash bucket is 3,value is : 3. Found one,hash bucket is 20,value is : 20. Found one,hash bucket is 106,value is : 234.
“分析數據庫實現原理”的內容就介紹到這里了,感謝大家的閱讀。如果想了解更多行業相關的知識可以關注億速云網站,小編將為大家輸出更多高質量的實用文章!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。