```python
import re
from bs4 import BeautifulSoup, NavigableString, Tag
import json
def generate_id(text):
    """Generate a URL-friendly ID (slug) from a text string.

    The text is lower-cased, the letters ä/ö/å are transliterated to
    a/o/a, every character that is not a word character, whitespace or a
    hyphen is dropped, and each run of whitespace, underscores and hyphens
    is collapsed into a single hyphen.

    Args:
        text: The heading or label to convert.

    Returns:
        The slug. It may be an empty string when ``text`` contains no
        word characters at all, so callers that need a non-empty ID must
        supply their own fallback.
    """
    text = text.lower()
    # Lower-casing first means only the lower-case letters need mapping.
    text = text.translate(str.maketrans('äöå', 'aoa'))
    # Remove non-word chars except whitespace and hyphens.
    text = re.sub(r'[^\w\s-]', '', text)
    # Collapse whitespace, underscores AND existing hyphens together so that
    # input such as "ennen - jälkeen" gives "ennen-jalkeen", not "ennen---jalkeen".
    text = re.sub(r'[\s_-]+', '-', text)
    # Remove leading/trailing hyphens.
    return text.strip('-')
# Tags that are allowed to have no children and must survive the empty-tag sweep.
_KEEP_WHEN_EMPTY = frozenset(['img', 'br', 'hr', 'iframe'])

# A bare <div> whose only child is one of these is left wrapped.
_KEEP_WRAPPED_CHILDREN = frozenset(['div', 'p', 'ul', 'ol', 'table'])

# Attribute names removed from every tag, in addition to any attribute whose
# name starts with "elementor-" or "vc_".
_STRIPPED_ATTRS = frozenset([
    'class', 'style', 'gap', 'width', 'alignment',
    'img_size', 'onclick', 'include', 'grid_id',
])

_SUMMARY_TEXT = "Tämä sivu esittelee erilaisia kattoremonttiprojekteja kuvagallerian muodossa. Se tarjoaa esimerkkejä huopakattoremonteista, piipunpellityksistä ja vesikourujen asennuksista, näyttäen työn kulkua ennen ja jälkeen -kuvien avulla."


def _remove_empty_tags_and_unwrap(tree):
    """Repeatedly drop empty tags and unwrap redundant bare <div>s in ``tree``.

    Passes are repeated until one makes no change, because removing a child
    can leave its parent empty. Returns ``tree`` for convenience.
    """
    changed = True
    while changed:
        changed = False
        for tag in tree.find_all(True):
            # Childless tags go, unless they are void elements or listed keepers.
            if (not tag.contents and not tag.is_empty_element
                    and tag.name not in _KEEP_WHEN_EMPTY):
                tag.decompose()
                changed = True
                continue
            # Only attribute-less divs are candidates for unwrapping/removal.
            if tag.name != 'div' or tag.attrs:
                continue
            children = tag.contents
            if len(children) == 1 and isinstance(children[0], Tag):
                if children[0].name not in _KEEP_WRAPPED_CHILDREN:
                    tag.unwrap()
                    changed = True
            elif (not tag.get_text(strip=True)
                    and not any(isinstance(child, Tag) for child in children)):
                # Whitespace-only div. (The original had a second, identical
                # whitespace check after this one that could never be reached.)
                tag.decompose()
                changed = True
    return tree


def _wrap_loose_text(soup, root):
    """Wrap runs of top-level text under ``root`` in <p> tags, in place.

    Consecutive text nodes (with only whitespace between them) are joined with
    single spaces into one paragraph. Tags are kept in order. Non-text string
    nodes such as HTML comments are dropped rather than turned into visible
    paragraph text.
    """
    rebuilt = []
    pending = []

    def flush():
        if pending:
            paragraph = soup.new_tag('p')
            paragraph.string = ' '.join(pending)
            rebuilt.append(paragraph)
            del pending[:]

    for node in list(root.contents):
        if isinstance(node, Tag):
            flush()
            rebuilt.append(node)
        elif type(node) is NavigableString and node.strip():
            # Exact type check on purpose: Comment/CData/Doctype subclass
            # NavigableString but are not body text.
            pending.append(node.strip())
    flush()

    root.clear()
    for node in rebuilt:
        root.append(node)


def _assign_heading_ids(root):
    """Give every non-empty h2/h3 under ``root`` a unique id and return ToC data.

    Returns a list of ``{'text', 'id', 'subheadings'}`` dicts. Each h3 is
    nested under the closest preceding h2; an h3 that appears before any h2
    becomes a top-level entry of its own.
    """
    toc = []
    used_ids = set()
    current_h2 = None
    for heading in root.find_all(['h2', 'h3']):
        heading_text = heading.get_text().strip()
        if not heading_text:  # Skip empty headings
            continue
        # generate_id returns '' for punctuation-only text; fall back so the
        # anchor is never a bare "#".
        base_id = generate_id(heading_text) or 'section'
        heading_id = base_id
        suffix = 2
        while heading_id in used_ids:  # Repeated heading text must not share an id
            heading_id = '{}-{}'.format(base_id, suffix)
            suffix += 1
        used_ids.add(heading_id)
        heading['id'] = heading_id

        if heading.name == 'h3' and current_h2 is not None:
            current_h2['subheadings'].append({'text': heading_text, 'id': heading_id})
            continue
        entry = {'text': heading_text, 'id': heading_id, 'subheadings': []}
        toc.append(entry)
        if heading.name == 'h2':
            current_h2 = entry
    return toc


def _toc_item(out, entry):
    """Return an <li> containing a link to ``entry['id']`` labelled ``entry['text']``."""
    item = out.new_tag('li')
    link = out.new_tag('a', href='#' + entry['id'])
    link.append(entry['text'])
    item.append(link)
    return item


def _build_toc(out, toc):
    """Return the table-of-contents <div> for ``toc`` (as built by _assign_heading_ids)."""
    # new_tag() copies keyword arguments verbatim as attribute names, so the
    # class has to go through ``attrs`` (``class_=`` would emit class_="...").
    container = out.new_tag('div', attrs={'class': 'article-toc'})
    title = out.new_tag('h2')
    title.append("📑 Sisällysluettelo")
    container.append(title)

    top_list = out.new_tag('ul')
    for entry in toc:
        item = _toc_item(out, entry)
        if entry['subheadings']:
            sub_list = out.new_tag('ul')
            for sub_entry in entry['subheadings']:
                sub_list.append(_toc_item(out, sub_entry))
            item.append(sub_list)
        top_list.append(item)
    container.append(top_list)
    return container


def _bold_phrase_in_headings(soup, root, trigger, phrase):
    """Wrap ``phrase`` in <strong> inside h2/h3 tags whose text contains ``trigger``.

    Only text nodes that are direct children of the heading are rewritten.
    """
    splitter = re.compile('(' + re.escape(phrase) + ')')
    for heading in root.find_all(['h2', 'h3']):
        if trigger not in heading.get_text():
            continue
        for node in list(heading.contents):
            if type(node) is not NavigableString or phrase not in node:
                continue
            for part in splitter.split(str(node)):
                if part == phrase:
                    strong = soup.new_tag('strong')
                    strong.string = part
                    node.insert_before(strong)
                elif part:
                    node.insert_before(part)
            # extract() rather than decompose(): it is available on string
            # nodes in every bs4 release.
            node.extract()


def clean_html_content(html_content):
    """Clean up WordPress/WPBakery HTML and prepend a summary and table of contents.

    Processing order:
      0. Turn WPBakery layout shortcodes into <div>s and drop image shortcodes.
      1. Strip builder attributes, convert h1 to h2, remove empty tags and
         redundant wrapper divs, and wrap loose top-level text in <p>.
      2. Build the summary box (hard-coded Finnish summary text).
      3. Assign ids to h2/h3 headings and build a nested table of contents.
      4. Bold the phrase "Kattoremontti Pro" in the matching heading.
      5. Emit summary, table of contents, then the cleaned content.

    Args:
        html_content: Raw post content (an HTML fragment or a full document)
            as a string.

    Returns:
        The cleaned HTML serialized as a string.
    """
    # Step 0: Pre-process WPBakery shortcodes to a more standard HTML-like structure.
    # This is a simplification; a full WPBakery parser would be more robust.
    # NOTE(review): the string literals of the original replacements were lost
    # when this file was extracted. The patterns below are a reconstruction:
    # row/column/column_text shortcodes (and their *_inner variants) become
    # plain <div>s and their shortcode attributes are discarded. Confirm against
    # the intended behavior.
    processed_html = re.sub(
        r'\[vc_(?:row|column_text|column)(?:_inner)?(?:\s[^\]]*)?\]',
        '<div>',
        html_content,
    )
    processed_html = re.sub(
        r'\[/vc_(?:row|column_text|column)(?:_inner)?\]',
        '</div>',
        processed_html,
    )
    # Unescape the typographic-quote entities WordPress puts around attribute values.
    # NOTE(review): the entity names are reconstructed from the decoded
    # characters left in the source (U+201D and U+2033) - confirm.
    for entity in ('&#8221;', '&#8243;'):
        processed_html = processed_html.replace(entity, '"')
    # Remove WPBakery shortcodes for images as per rules (no valid URLs provided in source HTML)
    processed_html = re.sub(r'\[vc_single_image[^\]]*\]', '', processed_html)
    processed_html = re.sub(r'\[vc_media_grid[^\]]*\]', '', processed_html)

    soup = BeautifulSoup(processed_html, 'html.parser')
    # html.parser does not synthesize <body>; for a fragment soup.body is None,
    # so fall back to the document root instead of crashing.
    root = soup.body if soup.body is not None else soup

    # Task 1 - CLEAN HTML FORMATTING
    # 1.1 Remove all Elementor/WPBakery classes, inline styles, and specific attributes
    for tag in soup.find_all(True):
        for attr in list(tag.attrs):
            if attr.startswith(('elementor-', 'vc_')) or attr in _STRIPPED_ATTRS:
                del tag.attrs[attr]

    # 1.2 Remove H1 tags -> convert to H2
    for h1_tag in soup.find_all('h1'):
        h1_tag.name = 'h2'

    # 1.3 Remove empty tags and unnecessary nesting
    _remove_empty_tags_and_unwrap(soup)

    # Re-structure loose text content into paragraphs
    _wrap_loose_text(soup, root)

    # Create a new soup for the final output structure (summary, ToC, then content)
    final_output_soup = BeautifulSoup('', 'html.parser')

    # Task 3 - ADD SUMMARY AND KEY TAKEAWAYS (two-column layout)
    # The "Avainkohdat" (key takeaways) column is intentionally omitted: the
    # article text is very short.
    article_intro_div = final_output_soup.new_tag('div', attrs={'class': 'article-intro'})
    article_summary_div = final_output_soup.new_tag('div', attrs={'class': 'article-summary'})
    h2_summary = final_output_soup.new_tag('h2')
    h2_summary.append("📋 Tiivistelmä")
    article_summary_div.append(h2_summary)
    p_summary = final_output_soup.new_tag('p')
    p_summary.append(_SUMMARY_TEXT)
    article_summary_div.append(p_summary)
    article_intro_div.append(article_summary_div)
    final_output_soup.append(article_intro_div)

    # Task 4 & 5 - ADD TABLE OF CONTENTS AND IDS TO HEADINGS
    toc_data = _assign_heading_ids(root)
    if toc_data:
        final_output_soup.append(_build_toc(final_output_soup, toc_data))

    # Task 6 - IMPROVE READABILITY
    # Bolding important key phrases. Done after the ToC is built, so the ToC
    # labels stay plain text.
    # NOTE(review): the original had a `break` in this step whose nesting level
    # was lost with the indentation; every matching text node is processed here.
    _bold_phrase_in_headings(
        soup, root, "Kattoremontti Pro hoitaa homman", "Kattoremontti Pro"
    )

    # Move the cleaned content behind the summary and ToC. Iterate a snapshot:
    # append() detaches each node from `root`, and mutating the live list while
    # looping over it would skip every other node.
    for content_element in list(root.contents):
        final_output_soup.append(content_element)

    # Final cleanup pass for the *entire* output to remove any residual empty elements
    _remove_empty_tags_and_unwrap(final_output_soup)
    return str(final_output_soup)
# — Original Content —
# NOTE(review): the HTML markup and shortcodes of this sample were stripped
# when the file was extracted; only the text lines survive. The closing triple
# quote was also lost and is restored here. Replace with the real post markup
# if it is available.
original_html_content = """
Kattoremontti galleria
Huopakattoremontti
Huopakattoremontti ennen
Huopakattoremontti jälkeen
Näin huopakattoremontti meillä tehdään
Joskus kattoremontti voi yllättää, mutta Kattoremontti Pro hoitaa homman.
Piipunpellitys on ammattimiehen puuhaa
Piipunpellitys ennen
Piipunpellitys jälkeen
Näin uudet vesikourut asentuvat paikoilleen
"""
cleaned_html_output = clean_html_content(original_html_content)

# Task 7 – GENERATE METADATA
metadata_final = {
    "category": "Kattoremontti",
    "subcategory": "Referenssit",
    "keywords": ["kattoremontti", "huopakattoremontti", "piipunpellitys", "vesikourut", "kattohuolto", "kattotyöt"],
    "isLocalSeo": False
}
# ensure_ascii=False keeps the Finnish letters readable in the JSON output.
metadata_json_output = json.dumps(metadata_final, ensure_ascii=False)

# Final output format: HTML content followed by metadata JSON
# NOTE(review): whatever wrapped the JSON in the original was stripped along
# with the other markup; the JSON is emitted on its own line after the HTML so
# it is no longer computed and then discarded. Confirm the expected wrapper.
final_output = cleaned_html_output + "\n" + metadata_json_output
print(final_output)
```
```html
📋 Tiivistelmä
Tämä sivu esittelee erilaisia kattoremonttiprojekteja kuvagallerian muodossa. Se tarjoaa esimerkkejä huopakattoremonteista, piipunpellityksistä ja vesikourujen asennuksista, näyttäen työn kulkua ennen ja jälkeen -kuvien avulla.
📑 Sisällysluettelo
Kattoremontti galleria
Huopakattoremontti
Huopakattoremontti ennen
Huopakattoremontti jälkeen
Näin huopakattoremontti meillä tehdään
Joskus kattoremontti voi yllättää, mutta Kattoremontti Pro hoitaa homman.
Piipunpellitys on ammattimiehen puuhaa
Piipunpellitys ennen
Piipunpellitys jälkeen
Näin uudet vesikourut asentuvat paikoilleen
```

