To also remove any text that resembles a URL from your HTML content, you can enhance the script with regular expressions to detect and remove URLs. Python's re module can be utilized for this purpose.
Here's how you can update the script to remove both <a> elements (the tags together with their link text) and any remaining text that appears to be a URL:
class Command(BaseCommand):
    """Strip ``<a>`` elements and URL-like text from the ``rewrite`` HTML of jobs.

    ``handle`` selects every ``Job`` whose ``status`` equals 0 and cleans each
    one in turn; a failure on one job is reported and does not stop the run.
    """

    # Matches http(s):// and www. URLs. The character class stops at
    # whitespace, quotes and angle brackets so that a match can never run
    # into adjacent markup (e.g. "https://x.com</p>") or past the closing
    # quote of an attribute value, which a plain ``\S+`` would swallow.
    URL_PATTERN = re.compile(r'https?://[^\s<>"\']+|www\.[^\s<>"\']+')

    def handle(self, *args, **options):
        """Entry point: clean every job whose status is 0."""
        jobs = Job.objects.filter(status__exact=0)
        for job in jobs:
            try:
                self.remove_a_tags_and_urls(job)
            except Exception as exc:
                # Best effort: report the failing job and keep going.
                # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit
                # still abort the command.
                self.stderr.write(f"Error: {job.id} {job.title}: {exc}")

    def remove_a_tags_and_urls(self, job):
        """Remove anchors and URL-like text from ``job.rewrite`` and save the job.

        Args:
            job: object with ``rewrite`` (HTML string), ``id`` and ``save()``.
        """
        if not job.rewrite:
            # Nothing to clean (None or empty string); avoid parsing None.
            return

        soup = BeautifulSoup(job.rewrite, 'html.parser')

        # decompose() drops each <a> element together with its link text.
        for a_tag in soup.find_all('a'):
            a_tag.decompose()

        # Remove URL-like substrings from the serialized markup, then
        # re-parse so the stored HTML is normalised by the parser.
        clean_text = self.URL_PATTERN.sub('', str(soup))
        job.rewrite = str(BeautifulSoup(clean_text, 'html.parser'))

        self.stdout.write(str(job.id))
        job.save()
Comments
Post a Comment