22_string_algorithm.c

Download
c 468 lines 13.2 KB
  1/*
  2 * 문자열 알고리즘 (String Algorithms)
  3 * KMP, Rabin-Karp, Z-알고리즘, 매나커
  4 *
  5 * 패턴 매칭과 문자열 처리 알고리즘입니다.
  6 */
  7
  8#include <stdio.h>
  9#include <stdlib.h>
 10#include <string.h>
 11
 12#define MAX_LEN 100001
 13#define MOD 1000000007
 14#define BASE 31
 15
 16/* =============================================================================
 17 * 1. KMP (Knuth-Morris-Pratt)
 18 * ============================================================================= */
 19
 20/* 실패 함수 (부분 일치 테이블) 계산 */
 21int* compute_failure(const char* pattern) {
 22    int m = strlen(pattern);
 23    int* fail = calloc(m, sizeof(int));
 24
 25    int j = 0;
 26    for (int i = 1; i < m; i++) {
 27        while (j > 0 && pattern[i] != pattern[j]) {
 28            j = fail[j - 1];
 29        }
 30        if (pattern[i] == pattern[j]) {
 31            fail[i] = ++j;
 32        }
 33    }
 34
 35    return fail;
 36}
 37
 38/* KMP 검색 - 모든 매칭 위치 반환 */
 39int* kmp_search(const char* text, const char* pattern, int* match_count) {
 40    int n = strlen(text);
 41    int m = strlen(pattern);
 42    int* fail = compute_failure(pattern);
 43    int* matches = malloc(n * sizeof(int));
 44    *match_count = 0;
 45
 46    int j = 0;
 47    for (int i = 0; i < n; i++) {
 48        while (j > 0 && text[i] != pattern[j]) {
 49            j = fail[j - 1];
 50        }
 51        if (text[i] == pattern[j]) {
 52            j++;
 53            if (j == m) {
 54                matches[(*match_count)++] = i - m + 1;
 55                j = fail[j - 1];
 56            }
 57        }
 58    }
 59
 60    free(fail);
 61    return matches;
 62}
 63
 64/* =============================================================================
 65 * 2. Rabin-Karp
 66 * ============================================================================= */
 67
 68long long compute_hash(const char* s, int len) {
 69    long long hash = 0;
 70    long long power = 1;
 71    for (int i = 0; i < len; i++) {
 72        hash = (hash + (s[i] - 'a' + 1) * power) % MOD;
 73        power = (power * BASE) % MOD;
 74    }
 75    return hash;
 76}
 77
 78int* rabin_karp(const char* text, const char* pattern, int* match_count) {
 79    int n = strlen(text);
 80    int m = strlen(pattern);
 81    int* matches = malloc(n * sizeof(int));
 82    *match_count = 0;
 83
 84    if (m > n) return matches;
 85
 86    long long pattern_hash = compute_hash(pattern, m);
 87    long long text_hash = compute_hash(text, m);
 88    long long power = 1;
 89
 90    for (int i = 0; i < m - 1; i++) {
 91        power = (power * BASE) % MOD;
 92    }
 93
 94    for (int i = 0; i <= n - m; i++) {
 95        if (text_hash == pattern_hash) {
 96            /* 해시 충돌 확인 */
 97            int match = 1;
 98            for (int j = 0; j < m; j++) {
 99                if (text[i + j] != pattern[j]) {
100                    match = 0;
101                    break;
102                }
103            }
104            if (match) matches[(*match_count)++] = i;
105        }
106
107        if (i < n - m) {
108            /* 롤링 해시 */
109            text_hash = (text_hash - (text[i] - 'a' + 1) + MOD) % MOD;
110            text_hash = (text_hash * mod_inv(BASE, MOD)) % MOD;
111            text_hash = (text_hash + (text[i + m] - 'a' + 1) * power) % MOD;
112        }
113    }
114
115    return matches;
116}
117
118/* 모듈러 역원 (간단 버전) */
119long long mod_inv(long long a, long long mod) {
120    long long result = 1;
121    long long exp = mod - 2;
122    while (exp > 0) {
123        if (exp & 1) result = (result * a) % mod;
124        a = (a * a) % mod;
125        exp >>= 1;
126    }
127    return result;
128}
129
130/* 간단한 Rabin-Karp (롤링 해시 개선) */
131int* rabin_karp_simple(const char* text, const char* pattern, int* match_count) {
132    int n = strlen(text);
133    int m = strlen(pattern);
134    int* matches = malloc(n * sizeof(int));
135    *match_count = 0;
136
137    if (m > n) return matches;
138
139    long long pattern_hash = 0;
140    long long text_hash = 0;
141    long long power = 1;
142
143    /* 초기 해시 계산 */
144    for (int i = 0; i < m; i++) {
145        pattern_hash = (pattern_hash * BASE + pattern[i]) % MOD;
146        text_hash = (text_hash * BASE + text[i]) % MOD;
147        if (i < m - 1) power = (power * BASE) % MOD;
148    }
149
150    for (int i = 0; i <= n - m; i++) {
151        if (text_hash == pattern_hash) {
152            if (strncmp(text + i, pattern, m) == 0) {
153                matches[(*match_count)++] = i;
154            }
155        }
156        if (i < n - m) {
157            text_hash = ((text_hash - text[i] * power % MOD + MOD) * BASE + text[i + m]) % MOD;
158        }
159    }
160
161    return matches;
162}
163
164/* =============================================================================
165 * 3. Z-알고리즘
166 * ============================================================================= */
167
168int* z_function(const char* s) {
169    int n = strlen(s);
170    int* z = calloc(n, sizeof(int));
171    int l = 0, r = 0;
172
173    for (int i = 1; i < n; i++) {
174        if (i < r) {
175            z[i] = (z[i - l] < r - i) ? z[i - l] : r - i;
176        }
177        while (i + z[i] < n && s[z[i]] == s[i + z[i]]) {
178            z[i]++;
179        }
180        if (i + z[i] > r) {
181            l = i;
182            r = i + z[i];
183        }
184    }
185
186    return z;
187}
188
189/* Z-알고리즘으로 패턴 매칭 */
190int* z_search(const char* text, const char* pattern, int* match_count) {
191    int n = strlen(text);
192    int m = strlen(pattern);
193
194    /* pattern + "$" + text 연결 */
195    char* combined = malloc(n + m + 2);
196    sprintf(combined, "%s$%s", pattern, text);
197
198    int* z = z_function(combined);
199    int* matches = malloc(n * sizeof(int));
200    *match_count = 0;
201
202    for (int i = m + 1; i < n + m + 1; i++) {
203        if (z[i] == m) {
204            matches[(*match_count)++] = i - m - 1;
205        }
206    }
207
208    free(combined);
209    free(z);
210    return matches;
211}
212
213/* =============================================================================
214 * 4. 매나커 알고리즘 (Manacher)
215 * ============================================================================= */
216
217/* 가장 긴 팰린드롬 부분문자열 */
218char* manacher(const char* s) {
219    int n = strlen(s);
220    if (n == 0) return strdup("");
221
222    /* 문자 사이에 # 삽입 */
223    int len = 2 * n + 1;
224    char* t = malloc(len + 1);
225    for (int i = 0; i < n; i++) {
226        t[2 * i] = '#';
227        t[2 * i + 1] = s[i];
228    }
229    t[len - 1] = '#';
230    t[len] = '\0';
231
232    int* p = calloc(len, sizeof(int));
233    int center = 0, right = 0;
234    int max_len = 0, max_center = 0;
235
236    for (int i = 0; i < len; i++) {
237        if (i < right) {
238            int mirror = 2 * center - i;
239            p[i] = (p[mirror] < right - i) ? p[mirror] : right - i;
240        }
241
242        while (i - p[i] - 1 >= 0 && i + p[i] + 1 < len &&
243               t[i - p[i] - 1] == t[i + p[i] + 1]) {
244            p[i]++;
245        }
246
247        if (i + p[i] > right) {
248            center = i;
249            right = i + p[i];
250        }
251
252        if (p[i] > max_len) {
253            max_len = p[i];
254            max_center = i;
255        }
256    }
257
258    /* 원본 문자열에서 추출 */
259    int start = (max_center - max_len) / 2;
260    char* result = malloc(max_len + 1);
261    strncpy(result, s + start, max_len);
262    result[max_len] = '\0';
263
264    free(t);
265    free(p);
266    return result;
267}
268
269/* =============================================================================
270 * 5. 문자열 해싱
271 * ============================================================================= */
272
273typedef struct {
274    long long* prefix_hash;
275    long long* power;
276    int len;
277} StringHash;
278
279StringHash* create_string_hash(const char* s) {
280    int n = strlen(s);
281    StringHash* sh = malloc(sizeof(StringHash));
282    sh->prefix_hash = malloc((n + 1) * sizeof(long long));
283    sh->power = malloc((n + 1) * sizeof(long long));
284    sh->len = n;
285
286    sh->prefix_hash[0] = 0;
287    sh->power[0] = 1;
288
289    for (int i = 0; i < n; i++) {
290        sh->prefix_hash[i + 1] = (sh->prefix_hash[i] * BASE + s[i]) % MOD;
291        sh->power[i + 1] = (sh->power[i] * BASE) % MOD;
292    }
293
294    return sh;
295}
296
297long long get_hash(StringHash* sh, int l, int r) {
298    long long hash = (sh->prefix_hash[r + 1] -
299                      sh->prefix_hash[l] * sh->power[r - l + 1] % MOD + MOD) % MOD;
300    return hash;
301}
302
303void free_string_hash(StringHash* sh) {
304    free(sh->prefix_hash);
305    free(sh->power);
306    free(sh);
307}
308
309/* =============================================================================
310 * 6. 유용한 문자열 함수들
311 * ============================================================================= */
312
313/* 접미사 배열 (간단 버전, O(n² log n)) */
314int* suffix_array_simple(const char* s) {
315    int n = strlen(s);
316    int* sa = malloc(n * sizeof(int));
317    for (int i = 0; i < n; i++) sa[i] = i;
318
319    /* 정렬 */
320    for (int i = 0; i < n - 1; i++) {
321        for (int j = i + 1; j < n; j++) {
322            if (strcmp(s + sa[i], s + sa[j]) > 0) {
323                int temp = sa[i];
324                sa[i] = sa[j];
325                sa[j] = temp;
326            }
327        }
328    }
329
330    return sa;
331}
332
333/* LCP 배열 (Kasai 알고리즘) */
334int* lcp_array(const char* s, int* sa) {
335    int n = strlen(s);
336    int* rank = malloc(n * sizeof(int));
337    int* lcp = malloc(n * sizeof(int));
338
339    for (int i = 0; i < n; i++) rank[sa[i]] = i;
340
341    int k = 0;
342    for (int i = 0; i < n; i++) {
343        if (rank[i] == 0) {
344            lcp[0] = 0;
345            continue;
346        }
347        int j = sa[rank[i] - 1];
348        while (i + k < n && j + k < n && s[i + k] == s[j + k]) k++;
349        lcp[rank[i]] = k;
350        if (k > 0) k--;
351    }
352
353    free(rank);
354    return lcp;
355}
356
357/* =============================================================================
358 * 테스트
359 * ============================================================================= */
360
361int main(void) {
362    printf("============================================================\n");
363    printf("문자열 알고리즘 예제\n");
364    printf("============================================================\n");
365
366    /* 1. KMP */
367    printf("\n[1] KMP 알고리즘\n");
368    const char* text1 = "ABABDABACDABABCABAB";
369    const char* pattern1 = "ABABCABAB";
370    int match_count;
371    int* matches = kmp_search(text1, pattern1, &match_count);
372    printf("    텍스트: %s\n", text1);
373    printf("    패턴: %s\n", pattern1);
374    printf("    매칭 위치 (%d개): ", match_count);
375    for (int i = 0; i < match_count; i++) printf("%d ", matches[i]);
376    printf("\n");
377    free(matches);
378
379    /* 실패 함수 시각화 */
380    const char* pattern2 = "ABAABAB";
381    int* fail = compute_failure(pattern2);
382    printf("    패턴 \"%s\"의 실패 함수: ", pattern2);
383    for (int i = 0; i < (int)strlen(pattern2); i++) printf("%d ", fail[i]);
384    printf("\n");
385    free(fail);
386
387    /* 2. Z-알고리즘 */
388    printf("\n[2] Z-알고리즘\n");
389    const char* str = "aabxaabxcaabxaabxay";
390    int* z = z_function(str);
391    printf("    문자열: %s\n", str);
392    printf("    Z-배열: ");
393    for (int i = 0; i < (int)strlen(str); i++) printf("%d ", z[i]);
394    printf("\n");
395    free(z);
396
397    matches = z_search(text1, pattern1, &match_count);
398    printf("    Z-알고리즘 매칭 위치: ");
399    for (int i = 0; i < match_count; i++) printf("%d ", matches[i]);
400    printf("\n");
401    free(matches);
402
403    /* 3. Rabin-Karp */
404    printf("\n[3] Rabin-Karp\n");
405    const char* text2 = "abracadabra";
406    const char* pattern3 = "abra";
407    matches = rabin_karp_simple(text2, pattern3, &match_count);
408    printf("    텍스트: %s\n", text2);
409    printf("    패턴: %s\n", pattern3);
410    printf("    매칭 위치: ");
411    for (int i = 0; i < match_count; i++) printf("%d ", matches[i]);
412    printf("\n");
413    free(matches);
414
415    /* 4. 매나커 */
416    printf("\n[4] 매나커 알고리즘\n");
417    const char* str2 = "babad";
418    char* palindrome = manacher(str2);
419    printf("    문자열: %s\n", str2);
420    printf("    가장 긴 팰린드롬: %s\n", palindrome);
421    free(palindrome);
422
423    const char* str3 = "forgeeksskeegfor";
424    palindrome = manacher(str3);
425    printf("    문자열: %s\n", str3);
426    printf("    가장 긴 팰린드롬: %s\n", palindrome);
427    free(palindrome);
428
429    /* 5. 문자열 해싱 */
430    printf("\n[5] 문자열 해싱\n");
431    const char* str4 = "abcabc";
432    StringHash* sh = create_string_hash(str4);
433    printf("    문자열: %s\n", str4);
434    printf("    hash[0:2] = %lld\n", get_hash(sh, 0, 2));
435    printf("    hash[3:5] = %lld\n", get_hash(sh, 3, 5));
436    printf("    동일 여부: %s\n",
437           get_hash(sh, 0, 2) == get_hash(sh, 3, 5) ? "예" : "아니오");
438    free_string_hash(sh);
439
440    /* 6. 접미사 배열 */
441    printf("\n[6] 접미사 배열\n");
442    const char* str5 = "banana";
443    int* sa = suffix_array_simple(str5);
444    printf("    문자열: %s\n", str5);
445    printf("    접미사 배열: ");
446    for (int i = 0; i < (int)strlen(str5); i++) printf("%d ", sa[i]);
447    printf("\n");
448    printf("    정렬된 접미사:\n");
449    for (int i = 0; i < (int)strlen(str5); i++) {
450        printf("      %d: %s\n", sa[i], str5 + sa[i]);
451    }
452    free(sa);
453
454    /* 7. 복잡도 */
455    printf("\n[7] 복잡도\n");
456    printf("    | 알고리즘        | 전처리    | 검색      |\n");
457    printf("    |-----------------|-----------|----------|\n");
458    printf("    | KMP             | O(m)      | O(n)     |\n");
459    printf("    | Rabin-Karp      | O(m)      | O(n+m)*  |\n");
460    printf("    | Z-알고리즘      | O(n+m)    | -        |\n");
461    printf("    | 매나커          | -         | O(n)     |\n");
462    printf("    * 평균, 최악 O(nm)\n");
463
464    printf("\n============================================================\n");
465
466    return 0;
467}