用状态机实现XML解析器(2)

用 C/C++ 的 switch 语句，很容易实现状态分析法：每一种状态对应一段 case 代码。

BOOL XMLNode::LoadNode(const wchar_t* pszContent, const wchar_t* &pszEnd)
{
    ResetNode();
    const wchar_t* pCur = pszContent;
    const wchar_t* pBegin = NULL;
    const wchar_t* pEnd = NULL;
    xmlnode_state st = st_begin;

    wstr2wstr s2s;
    wchar_t chValueFlag;    // ' 或者 " 应该成对出现

    bool bStop = false;
    try
    {
        while(*pCur != 0 && !bStop)
        {
            switch(st)
            {
            case st_begin:
                {
                    if(pCur[0] == L'<')
                    {
                        _trace("########################################", NULL);
                        _trace("开始分析节点", pCur);

                        // 判断节点类型
                        if(pCur[1] == L'?')
                        {
                            // (1) "<?" 开头的是XML节点
                            pCur++;
                            m_type = et_xml;
                            st = st_tagstart;
                        }
                        else if(pCur[1]== L'!' && pCur[2] == L'-' && pCur[3] == L'-')
                        {
                            // (2) "<!--" 开头的是注释节点
                            pCur += 3;
                            m_type = et_comment;
                            st = st_commentstart;
                        }
                        else if(wcsncmp(pCur, L"<![CDATA[", 9) == 0)
                        {
                            // (2) "<![CDATA[" 开头 "]]>"结尾的是CDATA部件
                            pCur += 8;
                            m_type = et_cdata;
                            st = st_cdatastart;
                        }
                        else
                        {
                            st = st_tagstart;
                            m_type = et_normal;
                        }   
                    }
                    else
                    {
                        // 忽略所有'<'之前的字符
                        if(pCur[0] == L' '
                            || pCur[0] == L'/r'
                            || pCur[0] == L'/n'
                            || pCur[0] == L'/t')
                        {
                        }
                        else
                        {
                            goto error;
                        }
                    }
                }
                break;
            case st_tagstart:
                {
                    pBegin = pCur;
                    pEnd = NULL;
                    st = st_tagend;
                    pCur--;
                }
                break;
            case st_tagend:
                {
                    if(pCur[0] == L' ' ||
                        pCur[0] == L'>' ||
                        pCur[0] == L'/' && pCur[1] == L'>' && m_type == et_normal ||
                        pCur[0] == L'?' && pCur[1] == L'>' && m_type == et_xml
                        )
                    {
                        pEnd = pCur - 1;
                        st = st_attrnamestart;
                        pCur--;
                    }
                    else
                    {
                        // 非法tag名字符在此判断
                        if(pCur[0] == L'<' || pCur[0] == L'/')
                        {
                            _trace("tag名中出现了非法的字符", pCur);
                            goto error;
                        }
                    }

                    // 得到节点名称
                    if(pEnd != NULL)
                    {
                        if(getStr(pBegin, pEnd, m_strName))
                        {
                            pBegin = NULL;
                            pEnd = NULL;
                            _trace("tag Name", m_strName.c_str());
                        }
                        else
                        {
                            _trace("非法的tag", pBegin);
                            pCur = pBegin;
                            goto error;
                        }
                    }
                }
                break;
            case st_attrnamestart:
                {
                    if(L' ' == pCur[0])
                    {
                        // 跳过属性名前的空格
                    }
                    else
                    {
                        pBegin = pCur;
                        pEnd = NULL;
                        st = st_attrnameend;
                        pCur--;
                    }
                }
                break;
            case st_attrnameend:
                {
                    if(L'>' == pCur[0])
                    {
                        st = st_contentstart;
                    }
                    else if(L'/' == pCur[0] && L'>' == pCur[1] && m_type == et_normal ||
                        L'?' == pCur[0] && L'>' == pCur[1] && m_type == et_xml)
                    {
                        st = st_end;
                        pCur++;
                    }
                    else if(L'=' == pCur[0] || L' ' == pCur[0])
                    {
                        st = st_attrvaluestart;
                        pEnd = pCur - 1;
                    }
                    else
                    {
                    }
                    if(pEnd)
                    {
                        s2s.first = L"";
                        s2s.second = L"";
                        if(getStr(pBegin, pEnd, s2s.first))
                        {
                            _trace("属性名", s2s.first.c_str());
                        }
                        else
                        {
                            _trace("非法的属性名", pCur);
                            pCur = pBegin;
                            goto error;
                        }
                    }
                }
                break;
            case st_attrvaluestart:
                {
                    if(L'/'' == pCur[0] || L'/"' == pCur[0])
                    {
                        pBegin = pCur + 1;
                        pEnd = NULL;
                        st = st_attrvalueend;
                        chValueFlag = pCur[0];    // 记录'/"要成对出现
                    }
                    else if(L' ' == pCur[0])
                    {
                        // 属性名=后的空格过虑掉
                    }
                    else
                    {
                        _trace("属性名后有非法的字符", pCur);
                        goto error;
                    }
                }
                break;
            case st_attrvalueend:
                {
                    if((L'/'' == pCur[0] || L'/"' == pCur[0]) && pCur[0] == chValueFlag)
                    {
                        pEnd = pCur - 1;
                        getStr(pBegin, pEnd, s2s.second);
                        _trace("属性值", s2s.second.c_str());
                        m_AttrList.push_back(s2s);

                        if(
                            L' ' == pCur[1] ||
                            L'/' == pCur[1] && L'>' == pCur[2] && m_type == et_normal ||
                            L'?' == pCur[1] && L'>' == pCur[2] && m_type == et_xml ||
                            L'>' == pCur[1]
                            )
                        {
                            // 分析下一个属性
                            st = st_attrnamestart;
                        }
                        else
                        {
                            _trace("属性值/"//'之后发现非法字符", pCur);
                            goto error;
                        }
                    }
                    else
                    {
                        // 非法的属性值字符在此判断
                        // ..
                        // ..
                    }
                }
                break;
            case st_contentstart:
                {
                    // 不过虑空格
                    pBegin = pCur;
                    pEnd = NULL;
                    st = st_contentend;
                    pCur--;
                }
                break;
            case st_contentend:
                {
                    if(L'<' == pCur[0])
                    {
                        wstring strText;
                        pEnd = pCur - 1;
                        if(getStr(pBegin, pEnd, strText))
                        {
                            // 普通文本也作为一个子节点
                            _trace("content", strText.c_str());
                            if(isValidText(strText.c_str()))
                            {
                                XMLNode *pNode = new XMLNode;
                                pNode->m_type = et_text;
                                pNode->m_strText = strText;
                                linkChild(pNode);
                            }
                            else
                            {
                                _trace("无效内容文本", strText.c_str());
                            }
                        }
                        else
                        {
                            _trace("空内容", pBegin);
                        }

                        // 内容结束了,判断下一步操作
                        if(L'/' == pCur[1] && m_type == et_normal ||
                            L'?' ==pCur[1] && m_type == et_xml)
                        {                       
                            st = st_endtagstart;
                            pCur++;
                        }
                        else
                        {
                            st = st_child;
                            pCur--; // 子节点从"<"开始,所以回退1格
                        }
                        pBegin = NULL;
                        pEnd = NULL;
                    }
                    else
                    {
                        // 非法的内容字符在此判断
                        // ..
                        // ..
                    }
                }
                break;
            case st_cdatastart:
                {
                    pBegin = pCur;
                    pEnd = NULL;
                    st = st_cdataend;
                    pCur--;
                }
                break;
            case st_cdataend:
                {
                    if(wcsncmp(pCur, L"]]>", 3) == 0)
                    {
                        pEnd = pCur - 1;
                        getStr(pBegin, pEnd, m_strText); // CDATA文本也作为一个子节点
                        _trace("cdata content", m_strText.c_str());
                        // cdata结束了,判断下一步操作
                        pCur += 2;
                        st = st_end;
                    }
                    else
                    {
                        // 非法的内容字符在此判断
                        // ..
                        // ..
                    }
                }
                break;
            case st_commentstart:
                {
                    pBegin = pCur;
                    st = st_commentend;
                    pEnd = NULL;
                    pCur--;
                }
                break;
            case st_commentend:
                {
                    if(L'>' == pCur[0] && L'-' == *(pCur - 2) && L'-' == *(pCur - 1))
                    {
                        pEnd = pCur - 3;
                        getStr(pBegin, pEnd, m_strText);
                        _trace("comment content", m_strText.c_str());
                        st = st_end;
                    }
                    else
                    {
                        // 非法的注释字符在此判断
                        // ..
                        // ..
                    }
                }
                break;
            case st_endtagstart:
                {
                    pBegin = pCur;
                    pEnd = NULL;
                    st = st_endtagend;
                    pCur--;
                }
                break;
            case st_endtagend:
                {
                    if(L'>' == pCur[0])
                    {
                        pEnd = pCur - 1;
                        wstring strTag;
                        getStr(pBegin, pEnd, strTag);
                        _trace("endtagname", strTag.c_str());
                        if(strTag == m_strName)
                        {
                            st = st_end;
                        }
                        else
                        {
                            pCur = pBegin;
                            goto error;
                        }
                    }
                    else
                    {
                        //
                    }
                }
                break;
            case st_child:
                {
                    // 递归分析子节点
                    _trace("开始分析子节点", pCur);
                    XMLNode *pNode = new XMLNode;
                    if(pNode->LoadNode(pCur, pCur))
                    {
                        linkChild(pNode);
                        pCur--;
                        _trace("继续分析下一段内容(多一个字符)", pCur);
                        st = st_contentstart;   
                    }
                    else
                    {
                        delete pNode;
                        goto childerror;
                    }
                }
                break;
            case st_end:
                {
                    bStop = true;
                    pCur--;
                }
                break;
            default:
                {
                }
                break;
            }

            pCur++;
        }
    }
    catch (...)
    {
        _trace("捕捉到异常", NULL);
        goto error;
    }

    pszEnd = pCur;
    return st == st_end || st == st_begin;

error:
    _trace("发生错误, 原始内容", pszContent);
    _trace("错误位置", pCur);
childerror:
    pszEnd = pCur;
    return FALSE;
}

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/wydggf.html