Detect HTML links

  • + 0 comments

    For Python 3:

    import re
    # Pattern for one anchor element that carries a quoted href attribute.
    # findall() yields a (quote, url, inner_html) tuple for every match.
    A_TAG = re.compile(
        ''.join((
            r'<a\s+[^>]*?href\s*=\s*',  # "<a", whitespace, anything inside the tag, then "href ="
            r'([\'"])(.*?)\1',          # group 1: opening quote; group 2: URL up to the same quote
            r'[^>]*>',                  # rest of the opening tag through its ">"
            r'(.*?)',                   # group 3: element body, shortest possible
            r'</a>',                    # closing tag
        )),
        re.I | re.S,  # case-insensitive; "." also matches newlines
    )
    
    # One or more consecutive whitespace characters.
    WS = re.compile(r"\s+")
    # A single markup tag: "<", at least one character other than ">", then ">".
    TAG_STRIP = re.compile(r"<[^>]+>")
    
    def clean_text(raw):
        """Return *raw* with markup tags removed and whitespace normalized.

        Every substring matched by TAG_STRIP is deleted, each whitespace run
        matched by WS is then replaced by a single space, and leading and
        trailing whitespace is trimmed from the result.
        """
        without_tags = TAG_STRIP.sub('', raw)
        single_spaced = WS.sub(' ', without_tags)
        return single_spaced.strip()
    
    # Driver: the first input line gives the number of HTML lines that follow;
    # print one "url,text" row per anchor found, in the order they appear.
    N = int(input())
    # Scan the fragment as a whole rather than line by line: A_TAG is compiled
    # with re.DOTALL, so an anchor whose tag or body wraps onto the next line
    # can only be matched once the lines are joined back together.
    html = "\n".join(input() for _ in range(N))
    out_lines = [
        f"{url},{clean_text(txt)}"
        # The first capture group (the quote character) is not needed here.
        for _quote, url, txt in A_TAG.findall(html)
    ]
    print("\n".join(out_lines))