Detect HTML Tags, Attributes and Attribute Values

  • + 0 comments
    import re
    import sys
    
    # Consume the leading integer line (the value is not used afterwards) so
    # that the bulk read below starts at the markup itself.
    N = int(input())
    text = sys.stdin.read()

    # Opening or self-closing tag. Group 1 is the tag name, group 2 the raw
    # attribute text up to the closing '>' (a trailing '/' is left out).
    # Closing tags such as </head> never match because '/' cannot start a name.
    regex_endtag = r'(?<=<)([a-zA-Z0-9]+)([\s\S]*?)/?>'
    # A comment runs from '<!--' to the first '-->'. [\s\S] is used instead of
    # '.' so that comments spanning several lines are matched as well.
    regex_comment = r'<!--[\s\S]*?-->'

    # Remove comments first so tags written inside them are not reported.
    processed_input = re.sub(regex_comment, '', text)

    # One (tag_name, raw_attribute_text) tuple per opening tag, in document order.
    header = re.findall(regex_endtag, processed_input)

    # Flatten each attribute text onto a single line and drop the double quotes.
    # Newlines become spaces (not nothing) so attributes written on consecutive
    # lines stay separated for the whitespace split done when printing.
    cleaned_list = [
        (key, value.replace('\n', ' ').replace('"', '').strip())
        for key, value in header
    ]

    # Tags without attributes are stored as 1-tuples, the others as
    # (tag_name, "name=value name=value ...") pairs.
    cleaned_list_final = [
        (key, value) if value else (key,)
        for key, value in cleaned_list
    ]
    
    def print_formatted_list(items):
        """Print each tag name followed by its attributes.

        Args:
            items: iterable of tuples. ``item[0]`` is the tag name; an optional
                ``item[1]`` is a whitespace-separated string of ``name=value``
                tokens.

        Every tag name is printed on its own line. Each attribute follows as
        ``-> name > value``.
        """
        for item in items:
            print(item[0])
            if len(item) < 2:
                continue
            for attribute in item[1].split():
                # Split on the first '=' only: values such as URLs with a query
                # string contain further '=' characters that belong to the value.
                key, separator, value = attribute.partition('=')
                if not separator:
                    # Token without '=' cannot be shown as a name/value pair;
                    # skip it instead of failing on the unpacking.
                    continue
                print(f'-> {key} > {value}')
    
    # Write the tag/attribute report for the collected tags to stdout.
    print_formatted_list(cleaned_list_final)