/*
* This file is part of Catalog Builder - A movie catalog building software.
* Copyright (C) (2009) (Siva Chandran P) <siva.chandran.p@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <QFile>
#include <QRegExp>
#include <QDebug>
#include "htmlparser.h"
// Space separated list of tag names that are treated as having no end tag.
// Every name is surrounded by spaces so that " name " can be searched for as
// a whole word (see HtmlTag::doesTagHasEndTag()).
static QString g_TagsWithoutEndTag(" br input ");
// Space separated list of tag names whose content is not parsed as markup
// (see HtmlTag::doesTagContentParsable()); same " name " lookup convention.
static QString g_NonParsableTags(" style script meta input ");
// Creates a node of the given type that refers to `parent` as its parent.
// Only the pointer is stored; the node is not added to the parent's child
// list here.
HtmlNode::HtmlNode(Type type, HtmlNode *parent)
    : m_Type(type),
      m_Parent(parent)
{
}
// Destroys the node together with every child node stored in m_Children.
HtmlNode::~HtmlNode()
{
    // delete the pointed-to children first, then drop the stale pointers
    qDeleteAll(m_Children);
    m_Children.clear();
}
// Returns the first direct child whose type equals `type`, or NULL when the
// node has no child of that type.
//
// The previous implementation returned m_Children[0] (or NULL) before the
// search loop could run, so the `type` argument was silently ignored.
HtmlNode *HtmlNode::firstChild(HtmlNode::Type type)
{
    for (int i = 0; i < m_Children.size(); ++i)
    {
        HtmlNode *child = m_Children[i];
        if (child->m_Type == type)
            return child;
    }
    return NULL;
}
// Returns the node stored immediately before this one in the parent's child
// list, or NULL when there is no such node (no parent, this node is the first
// child, or this node is not part of the parent's child list).
HtmlNode *HtmlNode::prevSibling()
{
    // a node without a parent (e.g. the parser's root tag) has no siblings
    if (m_Parent == NULL)
        return NULL;

    const int thisNodeIndex = m_Parent->m_Children.indexOf(this);
    Q_ASSERT(thisNodeIndex != -1);

    // index 0 has no predecessor; -1 (not found) is rejected as well so that
    // release builds, where Q_ASSERT is compiled out, stay well defined
    if (thisNodeIndex <= 0)
        return NULL;

    return m_Parent->m_Children[thisNodeIndex - 1];
}
// Returns the node stored immediately after this one in the parent's child
// list, or NULL when there is no such node (no parent, this node is the last
// child, or this node is not part of the parent's child list).
HtmlNode *HtmlNode::nextSibling()
{
    // a node without a parent (e.g. the parser's root tag) has no siblings
    if (m_Parent == NULL)
        return NULL;

    const int thisNodeIndex = m_Parent->m_Children.indexOf(this);
    Q_ASSERT(thisNodeIndex != -1);

    // Without this check a node that is not in m_Children (attribute nodes
    // are kept in m_Attributes) would get m_Children[0] back in release
    // builds, because -1 + 1 is a valid index.
    if (thisNodeIndex == -1)
        return NULL;

    if (thisNodeIndex + 1 < m_Parent->m_Children.size())
        return m_Parent->m_Children[thisNodeIndex + 1];

    return NULL;
}
void HtmlNode::dump(int indent /*= 0*/)
{
QString msg;
for (int i = 0; i < indent * 4; i++)
msg.append(" ");
switch (m_Type)
{
case kTypeTag:
msg.append("[Tag]");
msg.append(QString().sprintf(" <%s>", qPrintable(((HtmlTag *)this)->m_Name)));
if (((HtmlTag *)this)->m_Attributes.size() > 0)
{
msg.append(" [");
for (int i = 0; i < ((HtmlTag *)this)->m_Attributes.size(); i++)
{
msg.append(((HtmlTag *)this)->m_Attributes[i]->m_Name);
msg.append("=");
msg.append(((HtmlTag *)this)->m_Attributes[i]->m_Value);
msg.append(",");
}
msg.append("]");
}
break;
case kTypeTagAttribute:
msg.append("[Attribute]");
// msg.append(QString().sprintf(" <%s>", qPrintable(((HtmlTagAttribute *)this)->m_Name)));
break;
case kTypeTagText:
msg.append("[Text]");
// msg.append(QString().sprintf(" {%s}", qPrintable(((HtmlTagText *)this)->m_Text)));
break;
default:
msg.append("[Unknown]");
break;
}
msg.append(QString().sprintf(" (%p)", this));
qDebug() << msg;
// for (int i = 0; i < m_Children.size(); i++)
// m_Children[i]->dump(indent + 1);
}
// Creates an attribute node called `name` with the value `value`; `tag` is
// stored as the parent of the attribute.
HtmlTagAttribute::HtmlTagAttribute(HtmlTag *tag, QString name, QString value)
    : HtmlNode(kTypeTagAttribute, tag),
      m_Name(name),
      m_Value(value)
{
}
// Serializes the attribute as name="value"; the value is always wrapped in
// double quotes and is written as stored (no escaping is applied).
QString HtmlTagAttribute::toString()
{
    QString rendered(m_Name);
    rendered.append("=");
    rendered.append("\"");
    rendered.append(m_Value);
    rendered.append("\"");
    return rendered;
}
// Creates a text node holding `text`; `tag` is stored as the parent of the
// text node.
HtmlTagText::HtmlTagText(HtmlTag *tag, const QString text)
    : HtmlNode(kTypeTagText, tag),
      m_Text(text)
{
}
// Serializes the text node: the stored text is returned unchanged.
QString HtmlTagText::toString()
{
    const QString &content = m_Text;
    return content;
}
// Decodes numeric character references in `*text` in place.
//
// Both the decimal (&#39;) and the hexadecimal (&#x27;) forms are replaced by
// the character they denote. References that cannot be represented by a
// single QChar (zero, surrogates, values above 0xFFFF) and named entities
// such as &amp; are left untouched. A NULL `text` is ignored.
//
// The previous body replaced an apostrophe with an apostrophe and therefore
// never changed the text.
void HtmlTagText::decodeHtmlEntities(QString *text)
{
    if (text == NULL || !text->contains(QLatin1Char('&')))
        return;

    QRegExp numericReference("&#([0-9]+|[xX][0-9a-fA-F]+);");
    int pos = 0;
    while ((pos = numericReference.indexIn(*text, pos)) != -1)
    {
        const QString digits = numericReference.cap(1);
        const int referenceLength = numericReference.matchedLength();

        const QChar first = digits.at(0);
        const bool isHex = (first == QLatin1Char('x') || first == QLatin1Char('X'));

        bool ok = false;
        const uint codePoint = isHex ? digits.mid(1).toUInt(&ok, 16)
                                     : digits.toUInt(&ok, 10);

        const bool isSurrogate = (codePoint >= 0xD800 && codePoint <= 0xDFFF);
        if (ok && codePoint != 0 && codePoint <= 0xFFFF && !isSurrogate)
        {
            text->replace(pos, referenceLength,
                          QChar(static_cast<ushort>(codePoint)));
            // continue after the decoded character so that a decoded '&' is
            // never interpreted as the start of another reference
            pos += 1;
        }
        else
        {
            // not decodable: keep the reference and continue after it
            pos += referenceLength;
        }
    }
}
// Creates a tag node called `name` that refers to `parent` as its parent.
// Comment tags ("!--") must not be created through this constructor; this is
// checked with Q_ASSERT.
HtmlTag::HtmlTag(HtmlTag *parent, QString name)
    : HtmlNode(kTypeTag, parent),
      m_Name(name)
{
    Q_ASSERT(name != "!--");
}
// Destroys the tag together with the attribute nodes stored in m_Attributes.
HtmlTag::~HtmlTag()
{
    // delete the pointed-to attributes first, then drop the stale pointers
    qDeleteAll(m_Attributes);
    m_Attributes.clear();
}
// Returns true when at least one direct child of this tag is a text node.
bool HtmlTag::hasText()
{
    bool found = false;
    for (int idx = 0; !found && idx < m_Children.size(); ++idx)
        found = (m_Children[idx]->m_Type == HtmlNode::kTypeTagText);
    return found;
}
// Returns the concatenation of the text of all direct text children, in child
// order. Text nested inside child tags is not included. The result is empty
// when the tag has no text child.
QString HtmlTag::getText()
{
    QString collected;
    const int childCount = m_Children.size();
    for (int idx = 0; idx < childCount; ++idx)
    {
        HtmlNode *child = m_Children[idx];
        if (child->m_Type != HtmlNode::kTypeTagText)
            continue;
        collected += static_cast<HtmlTagText *>(child)->m_Text;
    }
    return collected;
}
bool HtmlTag::doesTagHasEndTag(const QString tagName)
{
return (g_TagsWithoutEndTag.indexOf(" " + tagName + " ") == -1);
}
bool HtmlTag::doesTagContentParsable(const QString tagName)
{
return (g_NonParsableTags.indexOf(" " + tagName + " ") == -1);
}
// Serializes this tag and everything below it back to markup.
//
// - A tag with an empty name (the parser's root tag) emits only its children.
// - A "!--" tag emits "<!--", its children and "-->".
// - A tag without an end tag, or without children, is written in the self
//   closing form "<name attr="value" />".
// - Every other tag is written as "<name ...>children</name>".
QString HtmlTag::toString()
{
    QString buf;
    if (m_Name.isEmpty())
    {
        // Children may be text nodes as well as tags, so toString() is called
        // through the HtmlNode pointer; the former cast to HtmlTag * was
        // invalid for text children.
        for (int i = 0; i < m_Children.size(); i++)
            buf.append(m_Children[i]->toString());
        return buf;
    }

    buf.append("<");
    buf.append(m_Name);

    if (m_Name == "!--")
    {
        for (int i = 0; i < m_Children.size(); i++)
            buf.append(m_Children[i]->toString());
        buf.append("-->");
        return buf;
    }

    for (int i = 0; i < m_Attributes.size(); i++)
        buf.append(" " + m_Attributes[i]->toString());

    const bool hasEndTag = HtmlTag::doesTagHasEndTag(m_Name);
    if (!hasEndTag || m_Children.size() == 0)
    {
        buf.append(" />");
        Q_ASSERT(m_Children.size() == 0); // make sure we don't have any child
    }
    else
    {
        buf.append(">");
        for (int i = 0; i < m_Children.size(); i++)
            buf.append(m_Children[i]->toString());
        buf.append("</");
        buf.append(m_Name);
        buf.append(">");
    }
    return buf;
}
// Creates a parser whose root tag has no parent and an empty name.
// HtmlTag::toString() emits only the children of a tag with an empty name.
HtmlParser::HtmlParser()
:m_RootTag(NULL, "")
{
}
// Nothing to release explicitly here; m_RootTag is a member object and is
// destroyed together with the parser.
HtmlParser::~HtmlParser()
{
}
bool HtmlParser::parseFile(const QString filename)
{
QFile file(filename);
if (!file.open(QIODevice::ReadOnly))
return false;
QString buffer(file.readAll());
file.close();
return parseBuffer(buffer);
}
// Parses the markup in `buffer` and adds the resulting nodes to the root tag.
// Returns the result of parseTagContent() for the whole buffer.
bool HtmlParser::parseBuffer(const QString &buffer)
{
    int cursor = 0;                          // parsing starts at the first character
    const int lastIndex = buffer.size() - 1; // index of the last character
    return parseTagContent(buffer, m_RootTag, &cursor, lastIndex);
}
// Parses the content of `tag`, starting at `*start` and going no further than
// index `end` (inclusive).
//
// Text between tags is added to `tag` as HtmlTagText children and every start
// tag found is handed to parseTag(), which adds the parsed tag to `tag`.
// Parsing stops at the first end tag ("</"): `*start` is then set to the
// position of that "<" and the end tag itself is NOT consumed (parseTag()
// does that for the tag it is parsing).
//
// When the end of the range is reached without meeting an end tag, `*start`
// is set to one past the last scanned character. Previously `*start` was left
// untouched in that case, which made the caller parse the same content a
// second time (e.g. "<p>hello" was serialized as "<p>hello</p>hello").
//
// Returns false when parseTag() fails, true otherwise.
bool HtmlParser::parseTagContent(const QString &buffer, HtmlTag &tag,
                                 int *start, int end)
{
    int appendFrom = *start; // start of the text not yet added to `tag`
    int offset = *start;     // current scan position

    while (offset <= end)
    {
        if (buffer[offset] != '<')
        {
            offset++;
            continue;
        }

        // add the text collected before this '<'
        if (appendFrom < offset)
        {
            const QString text = buffer.mid(appendFrom, offset - appendFrom);
            tag.m_Children.append(new HtmlTagText(&tag, text));
        }

        // an end tag terminates the content of the current tag
        if (offset < end && buffer[offset + 1] == '/')
        {
            *start = offset;
            return true;
        }

        const int tagStartsAt = offset;
        if (!parseTag(buffer, tag, &offset, end))
        {
            qDebug("parseTag failed");
            return false;
        }

        // parseTag() must never move the cursor backwards
        if (offset < tagStartsAt)
            return false;

        appendFrom = offset;
    }

    // add whatever text is left after the last tag
    if (appendFrom < offset)
    {
        const QString text = buffer.mid(appendFrom, offset - appendFrom);
        tag.m_Children.append(new HtmlTagText(&tag, text));
    }

    // everything up to `end` has been consumed
    *start = offset;
    return true;
}
bool HtmlParser::parseTag(const QString &buffer, HtmlTag &parentTag,
int *start, int end)
{
// we assume *start points to the starting of the tag
Q_ASSERT(buffer[*start] == '<');
// parses the tag name
QRegExp tagNameRegExp("\\<([^ \t\n>]+)");
int startTagStartsAt = *start;
int ret = tagNameRegExp.indexIn(buffer, startTagStartsAt);
if (ret == -1)
{
qDebug("Can't extract tag name");
return false;
}
QString tagName = tagNameRegExp.cap(1);
QString startTagTerminator = ">";
bool ignoreTag = false;
if (tagName.startsWith("!--"))
{
tagName = "!--";
ignoreTag = true;
startTagTerminator = "-->"; // comment tag ends with -->
}
else if (tagName.startsWith("!DOCTYPE"))
{
ignoreTag = true;
}
int startTagEndsAt = buffer.indexOf(startTagTerminator, startTagStartsAt);
if (startTagEndsAt == -1 || startTagEndsAt > end)
{
qDebug("Can't find start tag's end");
return false;
}
startTagEndsAt += startTagTerminator.length();
// the complete start tag(<tagname ... >)
QString startTag = buffer.mid(*start, startTagEndsAt - startTagStartsAt);
// update the cursor with whatever we consumed already
*start = startTagEndsAt;
if (ignoreTag)
{
// qDebug("Ignoring tag %s", qPrintable(tagName));
return true;
}
HtmlTag *tag = new HtmlTag(&parentTag, tagName);
parseTagAttributes(startTag, *tag);
if (!startTag.endsWith("/>") && HtmlTag::doesTagHasEndTag(tagName))
{
if (HtmlTag::doesTagContentParsable(tagName))
parseTagContent(buffer, *tag, start, end);
QString endTag("</" + tagName + ">");
int endTagStartsAt = buffer.indexOf(endTag, *start);
if (endTagStartsAt > -1)
{
if (*start < endTagStartsAt)
{
int textStartsAt = *start;
QString text = buffer.mid(textStartsAt,
endTagStartsAt - textStartsAt);
tag->m_Children.append(new HtmlTagText(tag, text));
}
// update the cursor with whatever we consumed already
*start = endTagStartsAt + endTag.length();
}
}
parentTag.m_Children.append(tag);
return true;
}
bool HtmlParser::parseTagAttributes(const QString &startTag, HtmlTag &tag)
{
QRegExp attributeRegExp("(\\S+)\\s?=\\s?(\"[^\"]*\"|[^ >\"]+)");
int index = 0;
while (1)
{
index = attributeRegExp.indexIn(startTag, index);
if (index == -1)
break;
HtmlTagAttribute *htmlTagAttribute;
htmlTagAttribute = new HtmlTagAttribute(&tag, attributeRegExp.cap(1),
attributeRegExp.cap(2));
// if the value enclosed in double quotes then remove it
int len = htmlTagAttribute->m_Value.length();
if (len > 2 && htmlTagAttribute->m_Value[0] == '\"'
&& htmlTagAttribute->m_Value[len - 1] == '\"')
{
htmlTagAttribute->m_Value = htmlTagAttribute->m_Value.mid(1, len - 2);
}
tag.m_Attributes.append(htmlTagAttribute);
index += attributeRegExp.matchedLength();
}
return true;
}
// Debug helper: calls dump() on the root tag.
void HtmlParser::dump()
{
m_RootTag.dump();
}
// Serializes the parsed document by converting the root tag to a string.
QString HtmlParser::toString()
{
    const QString markup = m_RootTag.toString();
    return markup;
}