Sort by

recency

|

22 Discussions

|

  • + 0 comments

    This is a brilliant application of text classification using real-world Stack Exchange data — super insightful! For those of us managing multiple client websites, especially ones with diverse content, models like this can help automate topic tagging, improve search accuracy, and streamline content categorization.

  • + 0 comments

    Leveraging tools like Naive Bayes classification, as suggested, is a great starting point, especially for beginners in machine learning. ekbet sign up

  • + 0 comments

    A random forest classifier probably works well on this data:

    https://github.com/angelgldh/HackerRank/blob/main/Artificial_Intelligence/stack_exchange_question_classifier/text_classifier_quora_topics.ipynb

  • + 0 comments
    # Topic classifier script: fits a TF-IDF + multinomial Naive Bayes model on
    # the records in 'training.json', then labels the questions read from stdin
    # and prints one predicted topic per line.
    import json

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB

    # The first line of the training file is the record count N; each of the
    # next N lines is one JSON object from which 'question' and 'topic' are read.
    X_train, classes = [], []
    with open('training.json') as file:
        N = int(next(file))
        for _ in range(N):
            # json.loads instead of eval: eval would execute arbitrary code
            # from the input and fails on the JSON literals true/false/null.
            d = json.loads(next(file))
            X_train.append(d['question'])
            classes.append(d['topic'].strip())

    # Map topic names to integer ids and back. Sorting the distinct topics keeps
    # the mapping identical across runs (bare set order varies with string
    # hash randomization).
    classes_to_ix = {c: i for i, c in enumerate(sorted(set(classes)))}
    ix_to_classes = {i: c for c, i in classes_to_ix.items()}
    y_train = [classes_to_ix[c] for c in classes]

    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    X_train = vectorizer.fit_transform(X_train)

    clf = MultinomialNB(alpha=0.1)
    clf.fit(X_train, y_train)

    # Stdin layout mirrors the training file: a count P, then P JSON lines.
    P = int(input())
    X_pred = []
    for _ in range(P):
        d = json.loads(input())
        X_pred.append(d['question'])

    # Reuse the vocabulary fitted on the training data; do not refit here.
    X_pred = vectorizer.transform(X_pred)
    y_pred = [ix_to_classes[i] for i in clf.predict(X_pred)]
    print('\n'.join(y_pred))
    
  • + 0 comments

    # Topic classifier script: fits a LinearSVC on hashed text features built
    # from the records in 'training.json', then prints one predicted topic per
    # line for the records read from stdin.
    import json
    import sys

    from sklearn.svm import LinearSVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import HashingVectorizer

    # Python 2/3 compatibility: make raw_input available on Python 3.
    if sys.version_info[0] >= 3:
        raw_input = input

    transformer = HashingVectorizer(stop_words='english')

    # The first line of the training file is the record count; each following
    # line is one JSON object from which 'question', 'excerpt' and 'topic'
    # are read.
    _train = []
    train_label = []
    with open('training.json') as f:  # closes the file even if parsing fails
        for i in range(int(f.readline())):
            h = json.loads(f.readline())
            # Question and excerpt are joined into a single document.
            _train.append(h['question'] + "\r\n" + h['excerpt'])
            train_label.append(h['topic'])

    train = transformer.fit_transform(_train)
    svm = LinearSVC()
    svm.fit(train, train_label)

    # Stdin layout mirrors the training file: a count, then that many JSON lines.
    _test = []
    for i in range(int(raw_input())):
        h = json.loads(raw_input())
        _test.append(h['question'] + "\r\n" + h['excerpt'])

    test = transformer.transform(_test)
    test_label = svm.predict(test)
    for e in test_label:
        print(e)