from mechanize import Browser
br = Browser()
br.open('http://somewebpage')
html = br.response().readlines()
for line in html:
  print line

当在HTML文件中打印一行时,我试图找到一种方法,只显示每个HTML元素的内容,而不是格式本身。如果它发现'<a href="等等。例如">some text</a>',它只会打印'some text', '<b>hello</b>'打印'hello',等等。该怎么做呢?


当前回答

你可以使用BeautifulSoup get_text()特性。

from bs4 import BeautifulSoup

html_str = '''
<td><a href="http://www.fakewebsite.example">Please can you strip me?</a>
<br/><a href="http://www.fakewebsite.example">I am waiting....</a>
</td>
'''
soup = BeautifulSoup(html_str)

print(soup.get_text())
#or via attribute of Soup Object: print(soup.text)

建议显式地指定解析器,例如BeautifulSoup(html_str, features="html.parser"),以便输出可重现。

其他回答

如果你想去掉所有HTML标签,我发现最简单的方法是使用BeautifulSoup:

from bs4 import BeautifulSoup  # Or from BeautifulSoup import BeautifulSoup

def stripHtmlTags(htmlTxt):
    if htmlTxt is None:
            return None
        else:
            return ''.join(BeautifulSoup(htmlTxt).findAll(text=True)) 

我尝试了接受的答案的代码,但我得到了“RuntimeError:最大递归深度超出”,这没有发生在上面的代码块。

下面是一个简单的解决方案,基于惊人的快速lxml库剥离HTML标签并解码HTML实体:

from lxml import html

def strip_html(s):
    return str(html.fromstring(s).text_content())

strip_html('Ein <a href="">sch&ouml;ner</a> Text.')  # Output: Ein schöner Text.

如果你需要保留HTML实体(即&),我在Eloff的答案中添加了“handle_entityref”方法。

from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def handle_entityref(self, name):
        self.fed.append('&%s;' % name)
    def get_data(self):
        return ''.join(self.fed)

def html_to_text(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

有一个简单的方法:

def remove_html_markup(s):
    tag = False
    quote = False
    out = ""

    for c in s:
        if c == '<' and not quote:
            tag = True
        elif c == '>' and not quote:
            tag = False
        elif (c == '"' or c == "'") and tag:
            quote = not quote
        elif not tag:
            out = out + c

    return out

这里解释了这个想法:http://youtu.be/2tu9LTDujbw

你可以在这里看到它的工作:http://youtu.be/HPkNPcYed9M?t=35s

PS -如果你对这个课程(关于使用python进行智能调试)感兴趣,我给你一个链接:http://www.udacity.com/overview/Course/cs259/CourseRev/1。它是免费的!

欢迎你!:)

这是一个快速修复,甚至可以更优化,但它将工作良好。这段代码将用""替换所有非空标记,并从给定的输入文本中剥离所有html标记。你可以使用./file.py输入输出运行它

    #!/usr/bin/python
import sys

def replace(strng,replaceText):
    rpl = 0
    while rpl > -1:
        rpl = strng.find(replaceText)
        if rpl != -1:
            strng = strng[0:rpl] + strng[rpl + len(replaceText):]
    return strng


lessThanPos = -1
count = 0
listOf = []

try:
    #write File
    writeto = open(sys.argv[2],'w')

    #read file and store it in list
    f = open(sys.argv[1],'r')
    for readLine in f.readlines():
        listOf.append(readLine)         
    f.close()

    #remove all tags  
    for line in listOf:
        count = 0;  
        lessThanPos = -1  
        lineTemp =  line

            for char in lineTemp:

            if char == "<":
                lessThanPos = count
            if char == ">":
                if lessThanPos > -1:
                    if line[lessThanPos:count + 1] != '<>':
                        lineTemp = replace(lineTemp,line[lessThanPos:count + 1])
                        lessThanPos = -1
            count = count + 1
        lineTemp = lineTemp.replace("&lt","<")
        lineTemp = lineTemp.replace("&gt",">")                  
        writeto.write(lineTemp)  
    writeto.close() 
    print "Write To --- >" , sys.argv[2]
except:
    print "Help: invalid arguments or exception"
    print "Usage : ",sys.argv[0]," inputfile outputfile"