import os
import re

from bs4 import BeautifulSoup


############# Type 4 reference [TODOs] #############
def extract_tag_section(html: str, tag: str, verbose=False) -> dict:
    r"""
    \Label: CertainTagFinder

    Extracts, for every documented function, the text of the section whose
    title starts with a given tag (e.g., 'TODO').

    Each ``<div class="function">`` is scanned for its first
    ``<div class="section-title">`` whose text starts with ``tag``
    (case-insensitive). The text of every following sibling element is
    collected until the next section title or the end of the siblings.

    Args:
        html (str): The HTML content as a string.
        tag (str): The tag to extract (e.g., 'TODO').
        verbose (bool): When True, print each piece of text as it is collected.

    TODO:
        Finish the implementation to extract the section based on the tag.
        Add hyperlink to functions
        Extend it to bug fix and HACK

    Returns:
        dict: Maps the function name (text before the first '(') to the
        newline-joined text of its matching section. Functions without a
        matching, non-empty section are omitted, so the dict is empty when
        the tag is not found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    wanted = tag.upper()  # loop-invariant, so computed once
    result = {}

    for func_div in soup.find_all('div', class_='function'):
        # Get the function name, dropping the signature that follows '('
        name_div = func_div.find('div', class_='function-name')
        func_name = name_div.get_text(strip=True) if name_div else 'Unnamed Function'
        func_name = func_name.split('(')[0].strip()

        todos = []
        for section in func_div.find_all('div', class_='section-title'):
            if not section.text.strip().upper().startswith(wanted):
                continue
            # Collect all sibling elements until next section-title or end
            for sibling in section.find_next_siblings():
                if sibling.name == 'div' and 'section-title' in sibling.get('class', []):
                    break
                text = sibling.get_text(strip=True)
                todos.append(text)
                if verbose:
                    print(f"Found {tag} in {func_name}: {text}")
            break  # Assume only one TODO per function block

        if todos:
            result[func_name] = '\n'.join(todos)

    return result


def create_todo_html(tag: dict, rel_path, key='TODO'):
    r"""
    \Label: create_todo_html

    Generates HTML for a list of TODOs, each linked to its function.

    Args:
        tag (dict): A dictionary where keys are function names and values are
            TODO content.
        rel_path (str): The relative path to the HTML file for linking.
        key (str): Name of the tag being rendered; shown as a label in front
            of each entry.

    Returns:
        str: HTML string containing the TODO sections, one entry per line.
        An empty string is returned when ``tag`` is empty.

    Example:
        >>> todo = {'calculate_area': 'Fix the calculation for negative radius',
        ...         'greet': 'Add support for multiple languages'}
        >>> rel_path = 'docs/functions.html'
        >>> html = create_todo_html(todo, rel_path)
        >>> 'docs/functions.html#calculate_area' in html
        True
    """
    if not tag:
        return ''

    # Imported locally so the file's top-level import block is unchanged.
    from html import escape

    # NOTE(review): the original markup of each entry was lost from this file
    # (the f-string was cut off after its opening quote). The structure and
    # class names below are a minimal reconstruction from the docstring --
    # confirm them against the project's stylesheet.
    entries = []
    for func_name, content in tag.items():
        anchor = func_name.replace(' ', '_')  # simple anchor generation
        href = escape(f'{rel_path}#{anchor}', quote=True)
        # Escape the text, then turn newlines into <br> so multi-line content
        # keeps its line breaks once rendered.
        body = escape(str(content)).replace('\n', '<br>')
        entries.append(
            f'<div class="todo-item">'
            f'<span class="todo-tag">{escape(str(key))}</span> '
            f'<a href="{href}">{escape(func_name)}</a>: {body}'
            f'</div>'
        )
    return '\n'.join(entries)
# Skip text inside <code>, <pre>, <a>, or any ancestor <a> tags
if parent.name in ['code', 'pre', 'a'] or any(ancestor.name == 'a' for ancestor in parent.parents):
continue
def replace_match(match):
    """
    Turn one matched file name into a hyperlink to that file's HTML page.

    Reads `file_name_to_html_path` and `current_html_path` from the
    enclosing scope.

    Args:
        match (re.Match): Match whose full text is the file name.

    Returns:
        str: The word wrapped in an ``<a href=...>`` element, or the word
        unchanged when it is 'notes'.

    Raises:
        KeyError: If the matched word is not a key of `file_name_to_html_path`.
    """
    word = match.group(0)
    if word == 'notes':
        return word  # Do not hyperlink the word 'notes'
    target_path = file_name_to_html_path[word]
    # Compute relative path from current file to target file
    rel_path = os.path.relpath(target_path, start=os.path.dirname(current_html_path))
    rel_path = rel_path.replace('\\', '/')  # Ensure URL-safe path
    # Return the hyperlink-wrapped word. Previously the bare word was
    # returned, which left rel_path unused and made the substitution a no-op.
    return f'<a href="{rel_path}">{word}</a>'
# Replace file name matches in the text node
new_text = re.sub(pattern, replace_match, text_node)
# Replace the original text with new HTML
text_node.replace_with(BeautifulSoup(new_text, 'html.parser'))
return str(soup)
############# Type 2 reference [Function name Manual] #############
def process_html_for_labels(html_content):
    r"""
    Process HTML content to replace \Label tags with HTML anchors and collect all labels.

    This function scans the HTML for `\Label: <label_name>` patterns, replaces each
    with an HTML anchor (for linking), and returns the modified HTML content along
    with a list of the extracted labels.

    Args:
        html_content (str): Raw HTML string containing \Label tags.

    Returns:
        tuple:
            - processed_html (str): HTML with anchor tags inserted.
            - labels (list[str]): List of extracted label names, in order of
              appearance (duplicates are kept).
    """
    labels = []

    def replace_label(match):
        label = match.group(1)
        labels.append(label)
        # Insert label visually and as an anchor target: the id makes
        # "#<label>" a valid link fragment, the element text keeps the
        # label visible on the page.
        # NOTE(review): the original markup was lost from this file; confirm
        # the element and any styling against the project's templates.
        return f'<a id="{label}">{label}</a>'

    # The label name is restricted to \w+, so it is safe to place in the id
    # attribute without escaping.
    processed_html = re.sub(r'\\Label:\s*(\w+)', replace_label, html_content)
    return processed_html, labels
# Second pass: Replace \Ref with hyperlinks
def process_html_for_labels_replace(processed_files: list, label_to_file: dict):
    r"""
    Replace \Ref tags in HTML files with hyperlinks to corresponding label anchors.

    For each HTML file in `processed_files`, this function reads the file, finds all
    `\Ref: <label_name>` tags, and replaces them with HTML links pointing to the file
    and anchor associated with that label (if available in `label_to_file`).
    Each file is rewritten in place. References to unknown labels are left
    untouched and a warning is printed.

    Args:
        processed_files (list[dict]): List of processed file metadata dictionaries,
            each containing at least a 'html_path' key.
        label_to_file (dict): A mapping from label names to the HTML file path
            containing the corresponding anchor.

    Returns:
        None
    """
    for file in processed_files:
        html_path = file['html_path']
        base_dir = os.path.dirname(html_path)

        # Read the HTML file content
        # NOTE(review): the file is read and written with the platform default
        # encoding, as before -- confirm it matches how these files are produced.
        with open(html_path, 'r') as f:
            content = f.read()

        def replace_ref(match):
            label = match.group(1)
            if label not in label_to_file:
                print(f"Warning: undefined reference '{label}' in {html_path}")
                # Leave the original text unchanged if label is not found
                return match.group(0)
            # Compute relative path to the label's file
            rel_path = os.path.relpath(label_to_file[label], start=base_dir)
            rel_path = rel_path.replace('\\', '/')  # Normalize Windows paths for web
            # Create clickable link to label anchor. Previously an empty
            # string was returned, which deleted every resolved reference.
            return f'<a href="{rel_path}#{label}">{label}</a>'

        # Replace all \Ref: <label_name> tags in the HTML
        content = re.sub(r'\\Ref:\s*(\w+)', replace_ref, content)

        # Write the modified content back to the file
        with open(html_path, 'w') as f:
            f.write(content)
if __name__ == "__main__":
    # Example usage: load one generated page, run the section extractor on
    # it with the tag 'function', and print whatever comes back.
    demo_page = 'docs/src/PyThon/Viscosity/PositionalEncoding/main.html'
    with open(demo_page, 'r') as page_file:
        page_markup = page_file.read()
    print("Processed HTML:", extract_tag_section(page_markup, 'function'))