-
-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathHtmlToText.h
84 lines (69 loc) · 1.98 KB
/
HtmlToText.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/* Copyright (C) 2022-2025 Stefan-Mihai MOGA
This file is part of WebSearchEngine application developed by Stefan-Mihai MOGA.
WebSearchEngine is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Open
Source Initiative, either version 3 of the License, or any later version.
WebSearchEngine is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
WebSearchEngine. If not, see <http://www.opensource.org/licenses/gpl-3.0.html>*/
#pragma once
// Character-level scanner over an HTML string held in _html, with a cursor
// (_pos) that the inline helpers below inspect and advance.
// Judging by the class name this drives an HTML-to-plain-text conversion; the
// conversion itself (Convert, ParseTag, EatInnerContent) is defined outside
// this header, so their exact behaviour is not documented here.
class CHtmlToText
{
public:
    CHtmlToText();
    ~CHtmlToText();

    // Defined elsewhere. Takes the HTML input and returns a reference to a
    // std::string (presumably the converted text kept in _text -- confirm
    // against the implementation before relying on its lifetime).
    const std::string& Convert(const std::string& html);

    // Defined elsewhere. Returns a string and reports a flag through
    // `selfClosing` (presumably the tag name and whether the tag ended
    // with "/>" -- confirm against the implementation).
    std::string ParseTag(bool& selfClosing);

    // Defined elsewhere. Presumably skips everything up to the closing
    // counterpart of `tag` -- confirm against the implementation.
    void EatInnerContent(const std::string& tag);

    // True once the cursor has reached (or passed) the end of _html.
    bool EndOfText() const { return _pos >= _html.length(); }

    // Returns the character under the cursor without consuming it, or '\0'
    // when the cursor is at the end of _html.
    char Peek() const { return (_pos < _html.length()) ? _html[_pos] : '\0'; }

    // Advances the cursor by one character, clamping it at _html.length()
    // so it never moves past the end of the input.
    void MoveAhead()
    {
        const size_t next = _pos + 1;
        _pos = (next < _html.length()) ? next : _html.length();
    }

    // True for space, tab, carriage return and line feed. Narrow literals
    // are used because the scanner works on `char` data from std::string.
    bool IsWhiteSpace(char ch) const
    {
        return (ch == ' ') || (ch == '\t') || (ch == '\r') || (ch == '\n');
    }

    // Consumes consecutive whitespace characters starting at the cursor.
    // Terminates at end of input because Peek() then yields '\0', which is
    // not whitespace.
    void EatWhitespace()
    {
        while (IsWhiteSpace(Peek()))
            MoveAhead();
    }

    // Consumes whitespace starting at the cursor, but stops right after the
    // first line feed it consumes, leaving the cursor at the start of the
    // following line.
    void EatWhitespaceToNextLine()
    {
        while (IsWhiteSpace(Peek()))
        {
            const char ch = Peek();
            MoveAhead();
            if (ch == '\n')
                break;
        }
    }

    // If the cursor is on a single or double quote, consumes the quoted
    // value including its delimiters. Does nothing when the cursor is not
    // on a quote character.
    void EatQuotedValue()
    {
        const char mark = Peek();
        if ((mark == '\"') || (mark == '\''))
        {
            // Opening quote
            MoveAhead();
            // Find end of value. A CR or LF also ends the scan (and is
            // consumed), so an unterminated quote cannot swallow the rest
            // of the document.
            while (!EndOfText())
            {
                const char ch = Peek();
                MoveAhead();
                if ((ch == mark) || (ch == '\r') || (ch == '\n'))
                    break;
            }
        }
    }

protected:
    std::string _text;              // presumably the accumulated plain-text output -- confirm in the .cpp
    std::string _html;              // input buffer read by Peek()/EndOfText()/MoveAhead()
    size_t _pos = 0;                // cursor: index of the next character of _html to read
    bool _preformatted = false;     // presumably set while inside preformatted content -- confirm in the .cpp
    CMapStringToString _tags;       // MFC string-to-string map; contents are populated outside this header
    CStringList _ignoreTags;        // MFC string list; presumably tags whose content is skipped -- confirm in the .cpp
};