Vous pouvez utiliser un sélecteur CSS avec le combinateur ~ (éléments frères suivants)
pour sélectionner les éléments à extraire :
from bs4 import BeautifulSoup
# Sample markup: one <div> to keep, followed by a "See also" heading and its list.
txt = '''
<div>This I want to keep</div>
<h2>
<span class="mw-headline" id="See_also">See also</span>
</h2>
<ul>
<li><a href="/wiki/List_of_adaptations_of_works_by_Stephen_King" title="List of adaptations of works by Stephen King">List of adaptations of works by Stephen King</a></li>
<li><a href="/wiki/Castle_Rock_(Stephen_King)" title="Castle Rock (Stephen King)">Castle Rock (Stephen King)</a></li>
<li><a href="/wiki/Charles_Scribner%27s_Sons" title="Charles Scribner's Sons">Charles Scribner's Sons</a> (aka Scribner)</li>
<li><a href="/wiki/Derry_(Stephen_King)" title="Derry (Stephen King)">Derry (Stephen King)</a></li>
<li><a href="/wiki/Dollar_Baby" title="Dollar Baby">Dollar Baby</a></li>
<li><a href="/wiki/Jerusalem%27s_Lot_(Stephen_King)" title="Jerusalem's Lot (Stephen King)">Jerusalem's Lot (Stephen King)</a></li>
<li><i><a href="/wiki/Haven_(TV_series)" title="Haven (TV series)">Haven</a></i></li>
</ul>
'''
soup = BeautifulSoup(txt, 'html.parser')

# Select the "See also" <h2> itself plus every sibling that follows it (`~ *`),
# then detach each match from the tree.
# `:-soup-contains` is the current spelling of the deprecated `:contains` alias;
# with soupsieve older than 2.1, use `:contains` instead.
see_also = 'h2:-soup-contains("See also")'
for tag in soup.select(f'{see_also} ~ *, {see_also}'):
    tag.extract()

# Print the remaining document once, after all matches have been removed.
print(soup)
Utilisez select avec le tag et/ou son attribut, combiné à l'opérateur ^= (« commence par ») :
# Detach every <a> whose href starts with the given prefix (`^=` is the CSS
# "attribute value begins with" operator).
for a in soup.select('a[href^="http://example.com"]'):
    a.extract()
Ou utilisez re.compile comme suit, en l'absence de tag :
import re

# Match the URL literally: re.escape stops the dots from matching any character.
url_pattern = re.compile(re.escape('http://www.example.com'))

# `string=` is the current name of the older `text=` argument (bs4 >= 4.4.0).
find_text = soup.find_all(string=url_pattern)
for comment in find_text:
    print(comment)
    # The whole matching text node is blanked, not just the URL inside it
    # (the original `comment.replace(comment, '')` always evaluated to '').
    fixed_text = ''
    comment.replace_with(fixed_text)