```python
import re
from bs4 import BeautifulSoup, NavigableString, Tag
import json

def generate_id(text):
    """Generate a URL-friendly ID (slug) from a text string.

    The text is lower-cased, the Finnish/Swedish letters ä, ö and å are
    mapped to their ASCII base letters, every character that is not a
    word character, whitespace or a hyphen is dropped, and any run of
    whitespace, underscores and hyphens is collapsed into a single hyphen.

    Args:
        text: The source text, typically a heading.

    Returns:
        The slug, without leading or trailing hyphens. May be an empty
        string when the text contains no word characters.
    """
    text = text.lower()
    text = text.translate(str.maketrans('äöå', 'aoa'))
    # Drop punctuation and symbols, keeping word characters, spaces and hyphens.
    text = re.sub(r'[^\w\s-]', '', text)
    # Hyphens are part of the collapsed class so that input such as
    # "a - b" yields "a-b" rather than "a---b".
    text = re.sub(r'[\s_-]+', '-', text)
    return text.strip('-')

# Summary paragraph used when the caller does not supply one.
_DEFAULT_SUMMARY_TEXT = "Tämä sivu esittelee erilaisia kattoremonttiprojekteja kuvagallerian muodossa. Se tarjoaa esimerkkejä huopakattoremonteista, piipunpellityksistä ja vesikourujen asennuksista, näyttäen työn kulkua ennen ja jälkeen -kuvien avulla."

# Attribute names (and name prefixes) removed from every tag in step 1.1.
# 'css' is included because attributes of [vc_column_text ...] shortcodes are
# copied onto the generated <div>, and 'css' is the attribute such shortcodes
# carry in this file's sample content.
_BUILDER_ATTR_PREFIXES = ('elementor-', 'vc_')
_BUILDER_ATTRS = frozenset([
    'class', 'style', 'gap', 'width', 'alignment', 'img_size',
    'onclick', 'include', 'grid_id', 'css',
])

# Tags that are legitimately empty and must survive the empty-tag sweep.
_KEEP_WHEN_EMPTY = frozenset(['img', 'br', 'hr', 'iframe'])
# A bare <div> wrapping exactly one of these is left in place.
_BLOCK_CHILDREN = frozenset(['div', 'p', 'ul', 'ol', 'table'])

# Entity and literal spellings of a double quote seen inside shortcodes.
_QUOTE_TOKENS = ('&quot;', '&#8221;', '&#8243;', '\u201d', '\u2033')

# A heading containing the trigger gets the phrase wrapped in <strong>.
_BOLD_TRIGGER = "Kattoremontti Pro hoitaa homman"
_BOLD_PHRASE = "Kattoremontti Pro"


def _normalize_shortcode_quotes(match):
    """Return the matched shortcode with quote variants replaced by '"'."""
    snippet = match.group(0)
    for token in _QUOTE_TOKENS:
        snippet = snippet.replace(token, '"')
    return snippet


def _preprocess_shortcodes(html_content):
    """Convert WPBakery layout shortcodes to <div> markup and drop image shortcodes."""
    # Quote normalisation is limited to vc_* shortcodes so that typographic
    # quotes in prose and entities inside real HTML attributes are untouched.
    processed = re.sub(r'\[/?vc_[^\]]*\]', _normalize_shortcode_quotes, html_content)

    # NOTE(review): the literal arguments of the row/column replacements were
    # lost from the source this was recovered from. They are reconstructed
    # here as [vc_row]/[vc_column] -> <div>; confirm against the original.
    for shortcode in ('vc_row', 'vc_column'):
        processed = processed.replace('[' + shortcode + ']', '<div>')
        processed = processed.replace('[/' + shortcode + ']', '</div>')

    # Shortcode attributes are carried over onto the <div>; they are removed
    # again by the attribute sweep after parsing.
    # NOTE(review): the placeholder class name is reconstructed - confirm.
    processed = re.sub(
        r'\[vc_column_text([^\]]*)\]',
        r'<div class="vc_column_text"\1>',
        processed,
    )
    processed = processed.replace('[/vc_column_text]', '</div>')

    # Image shortcodes carry only attachment IDs, no usable URLs: drop them.
    return re.sub(r'\[(?:vc_single_image|vc_media_grid)[^\]]*\]', '', processed)


def _strip_builder_attributes(soup):
    """Remove page-builder classes, inline styles and shortcode attributes."""
    for tag in soup.find_all(True):
        for attr in list(tag.attrs):
            if attr.startswith(_BUILDER_ATTR_PREFIXES) or attr in _BUILDER_ATTRS:
                del tag.attrs[attr]


def _remove_empty_tags_and_unwrap(tree):
    """Repeatedly drop empty tags and unwrap redundant bare <div>s until stable."""
    changed = True
    while changed:
        changed = False
        for tag in tree.find_all(True):
            # The list was taken before this pass started; skip tags that an
            # earlier iteration already detached (removed parent or unwrapped).
            if tag.parent is None:
                continue

            if (not tag.contents and not tag.is_empty_element
                    and tag.name not in _KEEP_WHEN_EMPTY):
                tag.decompose()
                changed = True
                continue

            if tag.name != 'div' or tag.attrs:
                continue

            children = tag.contents
            if len(children) == 1 and isinstance(children[0], Tag):
                # Keep the wrapper when the only child is itself a block.
                if children[0].name not in _BLOCK_CHILDREN:
                    tag.unwrap()
                    changed = True
            elif (not tag.get_text(strip=True)
                    and not any(isinstance(child, Tag) for child in children)):
                # Bare <div> holding nothing but whitespace.
                tag.decompose()
                changed = True
    return tree


def _wrap_loose_text(soup, root):
    """Wrap runs of top-level text in <p> tags and drop whitespace-only nodes."""
    rebuilt = []
    pending = []

    def flush():
        if pending:
            paragraph = soup.new_tag("p")
            paragraph.string = " ".join(pending)
            rebuilt.append(paragraph)
            del pending[:]

    for node in list(root.contents):
        if isinstance(node, Tag):
            flush()
            rebuilt.append(node)
        elif isinstance(node, NavigableString) and node.strip():
            pending.append(node.strip())
    flush()

    root.clear()
    for node in rebuilt:
        root.append(node)


def _new_tag_with_class(soup, name, css_class):
    """Create a tag with a real ``class`` attribute.

    ``new_tag(name, class_=...)`` would emit an attribute literally named
    ``class_``, so the attribute is assigned by key instead.
    """
    tag = soup.new_tag(name)
    tag['class'] = css_class
    return tag


def _build_intro(out_soup, summary_text):
    """Build the ``article-intro`` block holding the summary section."""
    intro = _new_tag_with_class(out_soup, "div", "article-intro")
    summary = _new_tag_with_class(out_soup, "div", "article-summary")
    title = out_soup.new_tag("h2")
    title.string = "📋 Tiivistelmä"
    summary.append(title)
    paragraph = out_soup.new_tag("p")
    paragraph.string = summary_text
    summary.append(paragraph)
    intro.append(summary)
    return intro


def _unique_heading_id(base, used_ids):
    """Return ``base`` (or a numbered variant) that is not yet in ``used_ids``."""
    base = base or 'section'
    candidate = base
    counter = 1
    while candidate in used_ids:
        counter += 1
        candidate = base + '-' + str(counter)
    used_ids.add(candidate)
    return candidate


def _assign_heading_ids(soup):
    """Give every non-empty h2/h3 an ``id`` and return the ToC structure.

    Returns a list of ``{'text', 'id', 'subheadings'}`` dicts, one per h2
    (an h3 that precedes any h2 is promoted to a top-level entry).
    """
    toc = []
    current_h2 = None
    used_ids = set()
    for heading in soup.find_all(['h2', 'h3']):
        text = heading.get_text().strip()
        if not text:
            continue
        heading_id = _unique_heading_id(generate_id(text), used_ids)
        heading['id'] = heading_id

        if heading.name == 'h2':
            current_h2 = {'text': text, 'id': heading_id, 'subheadings': []}
            toc.append(current_h2)
        elif current_h2 is not None:
            current_h2['subheadings'].append({'text': text, 'id': heading_id})
        else:
            toc.append({'text': text, 'id': heading_id, 'subheadings': []})
    return toc


def _toc_item(out_soup, entry):
    """Build one ``<li><a href="#id">text</a></li>`` ToC item."""
    item = out_soup.new_tag("li")
    link = out_soup.new_tag("a", href="#" + entry['id'])
    link.string = entry['text']
    item.append(link)
    return item


def _build_toc(out_soup, toc):
    """Build the ``article-toc`` block from the structure of ``_assign_heading_ids``."""
    container = _new_tag_with_class(out_soup, "div", "article-toc")
    title = out_soup.new_tag("h2")
    title.string = "📑 Sisällysluettelo"
    container.append(title)

    top_list = out_soup.new_tag("ul")
    for entry in toc:
        item = _toc_item(out_soup, entry)
        if entry['subheadings']:
            sub_list = out_soup.new_tag("ul")
            for sub_entry in entry['subheadings']:
                sub_list.append(_toc_item(out_soup, sub_entry))
            item.append(sub_list)
        top_list.append(item)
    container.append(top_list)
    return container


def _bold_key_phrase(soup):
    """Wrap the key phrase in <strong> inside the first heading that triggers it."""
    for heading in soup.find_all(['h2', 'h3']):
        if _BOLD_TRIGGER not in heading.get_text():
            continue
        for node in list(heading.contents):
            if not isinstance(node, NavigableString):
                continue
            for part in re.split(r'(Kattoremontti Pro)', str(node)):
                if not part:
                    continue
                if part == _BOLD_PHRASE:
                    piece = soup.new_tag("strong")
                    piece.string = part
                else:
                    piece = NavigableString(part)
                node.insert_before(piece)
            # extract() is available on every node type, unlike decompose().
            node.extract()
        # NOTE(review): the recovered source lost its indentation; stopping
        # after the first matching heading is the assumed intent - confirm.
        break


def clean_html_content(html_content, summary_text=None):
    """Clean WordPress/WPBakery HTML and prepend a summary and table of contents.

    Steps, in order: convert layout shortcodes to ``<div>`` and drop image
    shortcodes; strip page-builder attributes; turn ``h1`` into ``h2``;
    remove empty tags and redundant wrappers; wrap loose top-level text in
    ``<p>``; assign ``id`` attributes to ``h2``/``h3`` headings; build the
    summary block and (when there are headings) the table of contents;
    bold the key phrase; run a final empty-tag sweep.

    Args:
        html_content: Source HTML, possibly containing ``[vc_*]`` shortcodes.
            ``None`` is treated as an empty string.
        summary_text: Text for the summary paragraph. Defaults to the
            built-in summary for the gallery page.

    Returns:
        The cleaned HTML as a string: summary block, optional table of
        contents, then the cleaned content.
    """
    if summary_text is None:
        summary_text = _DEFAULT_SUMMARY_TEXT

    # Step 0: shortcodes -> plain markup.
    soup = BeautifulSoup(_preprocess_shortcodes(html_content or ''), 'html.parser')

    # Task 1 - CLEAN HTML FORMATTING
    _strip_builder_attributes(soup)
    for h1_tag in soup.find_all('h1'):
        h1_tag.name = 'h2'
    _remove_empty_tags_and_unwrap(soup)

    # html.parser does not synthesise <body> for fragments, so fall back to
    # the document itself when the input has no <body> element.
    root = soup.body if soup.body is not None else soup
    _wrap_loose_text(soup, root)

    # Output document: summary, ToC, then the cleaned content.
    final_output_soup = BeautifulSoup("", "html.parser")

    # Task 3 - SUMMARY (key takeaways are intentionally omitted for short articles).
    final_output_soup.append(_build_intro(final_output_soup, summary_text))

    # Task 4 & 5 - TABLE OF CONTENTS AND HEADING IDS
    toc_data = _assign_heading_ids(soup)
    if toc_data:
        final_output_soup.append(_build_toc(final_output_soup, toc_data))

    # Task 6 - IMPROVE READABILITY
    _bold_key_phrase(soup)

    # append() moves each node out of ``root``, so iterate over a snapshot;
    # iterating the live list would skip every other element.
    for content_element in list(root.contents):
        final_output_soup.append(content_element)

    _remove_empty_tags_and_unwrap(final_output_soup)
    return str(final_output_soup)

# --- Original Content ---
# NOTE(review): the heading/paragraph markup and layout shortcodes this sample
# presumably contained are absent from the recovered text; only the plain text
# and the image shortcodes remain. Restore the markup from the source page
# before relying on the generated output.
original_html_content = """

Kattoremontti galleria

Huopakattoremontti

Huopakattoremontti ennen

[vc_single_image image=”891″ img_size=”large” alignment=”center” css=”.vc_custom_1586175634054{margin-bottom: 10px !important;}”]

Huopakattoremontti jälkeen

[vc_single_image image=”1074″ img_size=”large” alignment=”center” css=”.vc_custom_1586175639223{margin-bottom: 10px !important;}”]

Näin huopakattoremontti meillä tehdään

[vc_media_grid element_width=”3″ grid_id=”vc_gid:1586175623764-98f8fce5-6a57-3″ include=”891,897,899,909,922,923,908,906,914,905,912,911,907,913,917,941,947,950,953,956,989,993,992,991,916,949,963,951,952,990,1074,1075″]

Joskus kattoremontti voi yllättää, mutta Kattoremontti Pro hoitaa homman.

[vc_media_grid element_width=”3″ grid_id=”vc_gid:1586175623765-cd0279ef-227b-0″ include=”928,927,924,925,929,931,926,982,976,977,958,970,969,980,996,994″]

Piipunpellitys on ammattimiehen puuhaa

Piipunpellitys ennen

[vc_single_image image=”901″ img_size=”large” alignment=”center” onclick=”img_link_large”]

Piipunpellitys jälkeen

[vc_single_image image=”1079″ img_size=”large” alignment=”center” onclick=”img_link_large”]

[vc_media_grid element_width=”3″ grid_id=”vc_gid:1586175623767-3083247c-bc5a-1″ include=”901,984,1023,1008,1019,1004,1007,1010,1020,1016,1021,1005,1015,1014,1013,1026,1025,1028,1076,1079″]

Näin uudet vesikourut asentuvat paikoilleen

[vc_media_grid element_width=”3″ grid_id=”vc_gid:1586175623770-23c4d364-737b-2″ include=”957,959,967,966,961,962″]

"""

cleaned_html_output = clean_html_content(original_html_content)

# Task 7 - GENERATE METADATA
metadata_final = {
    "category": "Kattoremontti",
    "subcategory": "Referenssit",
    "keywords": ["kattoremontti", "huopakattoremontti", "piipunpellitys", "vesikourut", "kattohuolto", "kattotyöt"],
    "isLocalSeo": False,
}
# ensure_ascii=False keeps the Finnish characters readable in the JSON.
metadata_json_output = json.dumps(metadata_final, ensure_ascii=False)

# Final output format: HTML content followed by metadata JSON.
# The metadata was previously computed but never emitted.
final_output = cleaned_html_output + "\n" + metadata_json_output

print(final_output)
```
```html

📋 Tiivistelmä

Tämä sivu esittelee erilaisia kattoremonttiprojekteja kuvagallerian muodossa. Se tarjoaa esimerkkejä huopakattoremonteista, piipunpellityksistä ja vesikourujen asennuksista, näyttäen työn kulkua ennen ja jälkeen -kuvien avulla.

📑 Sisällysluettelo

Kattoremontti galleria
Huopakattoremontti
Piipunpellitys on ammattimiehen puuhaa
- Piipunpellitys ennen
- Piipunpellitys jälkeen
Näin uudet vesikourut asentuvat paikoilleen