Discussion on Similar Strings Challenge

8 years ago+ 0 comments

In my solution I didn't end up relying on sorting, only on hashing, to help reduce the set of string positions that I need to examine. Then I do a full, expensive verify for each of those positions. It passes the tests but just barely. Room for improvement.

#include <vector>
#include <iostream>
#include <unordered_map>
using namespace std;

// Two subranges that are similar will always give same hash
int hash_subrange_similar(const string& s, int start, int len) {
    unsigned long hash = 5381;
    int similar_map[11] = {0};
    int count = 1;
    for (int i=start; i < start + len; i++) {
        int c = s[i];
        if (similar_map[c] == 0) {
            similar_map[c] = count;
            count++;
        }
        hash = hash * similar_map[c] + 33;
    }
    return (int)hash;
}

// Test if s[i..i+len] and s[j..j+len] are similar substrings
bool are_similar(const string& s, int i, int j, int len) {
    int similar_map_i[11] = {0};
    int similar_map_j[11] = {0};
    for (int k=0; k < len; k++) {
        int ci = s[i + k];
        int cj = s[j + k];
        if (similar_map_i[ci] == 0 && similar_map_j[cj] == 0) {
            similar_map_i[ci] = cj;
            similar_map_j[cj] = ci;
        } else if (similar_map_i[ci] != cj || similar_map_j[cj] != ci) {
            return false;
        }
    }
    return true;
}

int main() {
    int s_len, num_tests;
    cin >> s_len >> num_tests;
    string s;
    cin >> s;
    
    // Reduce characters to 1-10 range (reserve 0 for "unknown")
    for (int i=0; i < s.length(); i++) {
        s[i] = s[i] - 'a' + 1;
    }
    
    // To speed things up we start by creating an index:
    // we hash the first 12 characters at each position and
    // then track for each hash which positions match that hash.
    const int hash_length = 12;
    unordered_map<int, vector<int>> index;
    for (int i=0; i <= (int)s.length() - hash_length; i++) {
        int hash = hash_subrange_similar(s, i, hash_length);
        index[hash].push_back(i);
    }
    
    for (int i=0; i < num_tests; i++) {
        int start, end;
        cin >> start >> end;
        int query_start = start - 1; // start is 1-based
        int query_len = end - start + 1; // start/end are inclusive
                
        int count = 0;
        if (query_len >= hash_length) {
            // Hash first 12 chars of query then use index to get possible matching positions
            int hash = hash_subrange_similar(s, query_start, hash_length);
            for (int j : index[hash]) {
                if (j <= s.length() - query_len) {
                    if (are_similar(s, query_start, j, query_len)) {
                        count++;
                    }
                }
            }
        } else {
            // Brute force for short queries
            for (int j=0; j <= s.length() - query_len; j++) {
                if (are_similar(s, query_start, j, query_len)) {
                    count++;
                }
            }
        }
        cout << count << endl;
    }
    
    return 0;
}

Cookie support is required to access HackerRank