Detect HTML Tags

  • + 0 comments

    C

    /*
     * qsort() comparator for an array of `char *` elements.
     * Each argument points at one array element (i.e. at a `char *`); the two
     * referenced strings are ordered with strcmp().
     */
    int cmp_str(const void *a, const void *b)
    {
        const char *lhs = *(char *const *)a;
        const char *rhs = *(char *const *)b;

        return strcmp(lhs, rhs);
    }

    /*
     * Reads text from stdin, collects the distinct HTML tag names that appear in
     * it, and prints them in strcmp() order separated by ';' followed by a newline.
     *
     * Returns 0 on success, 1 if the regular expression fails to compile.
     * Input beyond sizeof(buffer) - 1 bytes is not read.
     */
    int main(void) {
        /*
         * '<', optional blanks, optional '/', then the tag name (capture group 1).
         * A POSIX character class is only valid inside a bracket expression, hence
         * "[[:blank:]]"; ERE has no lazy "+?" quantifier, so a plain "*" is used.
         */
        const char *pattern = "<[[:blank:]]*/?([A-Za-z0-9]+)";
        char buffer[MAX_INPUT];
        char error_msg[128];
        regex_t regex;

        // --- Get input from stdin ---
        size_t input_len = fread(buffer, 1, sizeof(buffer) - 1, stdin);
        buffer[input_len] = '\0';  // regexec() needs a NUL-terminated string

        // --- Compile regex ---
        int result = regcomp(&regex, pattern, REG_EXTENDED);
        if (result != 0) {
            regerror(result, &regex, error_msg, sizeof(error_msg));
            printf("Regex compile error: %s\n", error_msg);
            return 1;
        }

        // --- Collect unique tag names ---
        char match_out[MAX_OUTPUT];  // names stored back to back, each NUL-terminated
        char *tokens[100];           // tokens[i] points at the i-th name in match_out
        const size_t max_tokens = sizeof(tokens) / sizeof(tokens[0]);
        size_t used = 0;             // bytes of match_out already occupied
        size_t count = 0;            // number of names stored so far

        const char *ptr = buffer;
        regmatch_t match[2];         // [0] = whole match, [1] = tag name

        while (regexec(&regex, ptr, 2, match, 0) == 0) {
            const char *name = ptr + match[1].rm_so;
            size_t name_len = (size_t)(match[1].rm_eo - match[1].rm_so);

            // Skip names that were already recorded so duplicates cost no storage.
            int seen = 0;
            for (size_t i = 0; i < count; i++) {
                if (strlen(tokens[i]) == name_len &&
                    memcmp(tokens[i], name, name_len) == 0) {
                    seen = 1;
                    break;
                }
            }

            if (!seen) {
                // Stop collecting rather than write past either fixed-size table.
                if (count == max_tokens ||
                    name_len + 1 > sizeof(match_out) - used) {
                    break;
                }
                memcpy(match_out + used, name, name_len);
                match_out[used + name_len] = '\0';
                tokens[count++] = match_out + used;
                used += name_len + 1;
            }

            // Resume the search right after this match; the match always
            // contains at least '<' plus one name character, so ptr advances.
            ptr += match[0].rm_eo;
        }

        // --- Sort and print, separated by ';' ---
        qsort(tokens, count, sizeof(tokens[0]), cmp_str);

        for (size_t i = 0; i < count; i++) {
            if (i != 0) {
                printf(";");
            }
            printf("%s", tokens[i]);
        }
        printf("\n");

        // --- Free compiled regex ---
        regfree(&regex);
        return 0;
    }