Build a Stack Exchange Scraper

Sort by

recency

|

108 Discussions

|

  • + 0 comments
    import re

    # One pattern captures, in order: the question id, the link text of the
    # "question-hyperlink" anchor, and the text of the "relativetime" element.
    regex = r'href=\"\/questions\/(\d+).*?class=\"question-hyperlink\">([^<]+)<.*?class="relativetime">([^<]+)<'

    # Collect stdin until EOF. input() drops the line terminators, so the page
    # ends up as one newline-free string and '.' can span the original lines.
    pieces = []
    while True:
        try:
            pieces.append(input())
        except EOFError:
            break
    singleline = "".join(pieces)

    # Each match is an (id, title, time) tuple; emit it as "id;title;time".
    for fields in re.findall(regex, singleline):
        print(';'.join(fields))
    
  • + 0 comments
    import re
    import sys


    def extract_rows(html):
        """Return one "id;title;time" string per question found in *html*.

        The three fields are gathered by three independent scans and paired by
        position. If the scans find different numbers of items, the surplus
        items are ignored instead of raising IndexError.
        """
        # Lazy quantifiers end each capture at the first closing marker, so
        # several tags on the same line no longer bleed into one match.
        ids = re.findall(r"(?<=questions/)\d+(?=/)", html)
        titles = re.findall(r'(?<=hyperlink">).+?(?=</a>)', html)
        times = re.findall(r'(?<=relativetime">).+?(?=<)', html)
        return [";".join(fields) for fields in zip(ids, titles, times)]


    if __name__ == "__main__":
        for row in extract_rows(sys.stdin.read()):
            print(row)
        
    
  • + 0 comments

    My solution (Python):

    # Reads the whole page from STDIN and prints one "id;title;time" line per
    # question to STDOUT.
    import re
    import sys

    page = sys.stdin.read()

    # Captures the question id from the href and the link text that is
    # immediately followed by "</a></h3>".
    link_pattern = re.compile(
        r'<a href="\/questions\/(\d+)\/.+?\>+?([^<]+)(?:<\/a><\/h3>)', re.S
    )
    # Captures the text inside the tag that follows the word "asked".
    asked_pattern = re.compile(r'asked.?<(?:[^>])+>([^<]+)<', re.S)

    links = link_pattern.findall(page)
    asked_times = asked_pattern.findall(page)

    # The two result lists are paired by position.
    for (question_id, title), when in zip(links, asked_times):
        print(f"{question_id};{title};{when}")
    
  • + 0 comments

    JavaScript: // (.*?) --> the lazy group stops at the first closing tag that follows it in the pattern (</a> or </span>); otherwise .* would be greedy and match all the content without stopping.

    // /gs --> The dotAll flag (s) changes the behavior of the . (dot) metacharacter in the regular expression. By default, . matches any character except newline characters (\n); with the s (dotAll) flag added, . matches newline characters (\n) as well.

    // Cut the page at every class="question-summary" marker; chunks the
    // pattern does not match are skipped.
    const summaries = input.split('class="question-summary"');
    if (summaries && summaries.length) {
        for (const summary of summaries) {
            // Groups: numeric id, link text, relative-time text.
            const fieldPattern = /.*question-summary-([0-9]+).*?class="question-hyperlink">(.*?)<\/a\>.*class="relativetime">(.*?)<\/span>.*/gs;
            const found = fieldPattern.exec(summary);
            if (!found) {
                continue;
            }
            // Index 0 is the whole match; only the captured groups are printed.
            const fields = found.slice(1);
            console.log(fields.join(';'));
        }
    }
    
  • + 0 comments

    Bash

    #!/bin/bash

    # Reads the page from stdin and, for every line that mentions a question
    # or a relative time, prints the extracted fields as "id;title;time".
    readarray myArray

    for line in "${myArray[@]}"; do
        if [[ $line =~ question ]] || [[ $line =~ relativetime ]]; then
            # grep -E has no \d class, so the id digits are matched with [0-9];
            # "/" is an ordinary character in an ERE and needs no backslash.
            # tr glues all matches from this line into one string, then sed
            # turns the title markers into ';' separators, replaces the '<'
            # left after the time with a newline, and strips "questions/" and
            # the slash that follows the id.
            echo "$line" | grep -E -o '(questions/[0-9]+/)|(question-hyperlink">.+</a>)|(relativetime">.+<)' |
                tr -d '\n' |
                sed 's/question-hyperlink">/;/; s/<\/a>/;/; s/relativetime">//; s/</\n/; s/questions\///; s/\///'
        fi
    done