/**
 * This module provides a way of cleaning up the HTML coming from Word. It
 * expects to receive a complete Word HTML document (pastes coming from Word
 * work this way).
 */

/**
 * Removes Word "cruft" from the given element and all of its descendants,
 * modifying the DOM in place:
 *
 * - elements whose tag name starts with "o:" or "w:" are removed together
 *   with everything inside them;
 * - classes starting with "Mso" are dropped (and the `class` attribute is
 *   removed entirely once no classes remain);
 * - inline style declarations whose property name starts with "mso" are
 *   dropped (and the `style` attribute is removed if nothing else is left);
 * - comment nodes are removed.
 *
 * @param element the element to remove Word cruft from
 */
function removeWordCruft(element: Element): void {
  // If the current element is itself "Word cruft" (tag starts with "o:" or
  // "w:"), remove it and all its children; there is nothing left to clean.
  const tagPrefix = element.tagName.toLowerCase().substring(0, 2);
  if (tagPrefix === 'o:' || tagPrefix === 'w:') {
    const parent = element.parentNode;
    if (parent) {
      parent.removeChild(element);
    }
    return;
  }

  // classList is live, so collect the matching names first and remove them
  // afterwards instead of mutating the list while walking it.
  const msoClasses = Array.from(element.classList).filter(
    (name) => name.substring(0, 3) === 'Mso'
  );
  for (const name of msoClasses) {
    element.classList.remove(name);
  }
  if (element.classList.length === 0) {
    element.removeAttribute('class');
  }

  // Remove "mso..." declarations from the inline style, if there is one.
  const style = element.getAttribute('style');
  if (style) {
    const cleaned = style.replace(/mso[-\w]+:[^;]*;?/g, '');
    if (/^\s*$/.test(cleaned)) {
      element.removeAttribute('style');
    } else {
      element.setAttribute('style', cleaned);
    }
  }

  // Comments are useless here and may be Word conditional comments, so drop
  // them. childNodes is live, hence the snapshot.
  const comments = Array.from(element.childNodes).filter(
    (node) => node.nodeType === Node.COMMENT_NODE
  );
  for (const comment of comments) {
    element.removeChild(comment);
  }

  // Recurse over a snapshot of the children: the recursive call may remove
  // the child from this element, and indexing the live collection while that
  // happens would skip the sibling that follows every removed element.
  for (const child of Array.from(element.children)) {
    removeWordCruft(child);
  }
}

/**
 * The list numbering referenced by an element's inline `mso-list` style
 * declaration: the first token is kept as the list id and the second as the
 * level.
 */
class WordNumbering {
  constructor(public id: string, public level: string) {}

  /** Key of the form "id:level" identifying this list id and level. */
  get formatKey(): string {
    return this.id + ':' + this.level;
  }

  /**
   * Extracts the numbering from the `mso-list` declaration in the element's
   * `style` attribute.
   * @param element the element whose inline style is inspected
   * @return the numbering, or null if the element has no style, no
   * `mso-list` declaration, or a declaration with fewer than two tokens
   */
  static parse(element: Element): WordNumbering | null {
    const style = element.getAttribute('style');
    if (!style) {
      return null;
    }
    const m = /\bmso-list:([^;]+)/.exec(style);
    if (!m) {
      return null;
    }
    // Trim before splitting: whitespace after the colon would otherwise
    // produce an empty first token and shift the id into the level slot.
    const parts = m[1].trim().split(/\s+/);
    return parts.length >= 2 ? new WordNumbering(parts[0], parts[1]) : null;
  }

  toString(): string {
    return '[WordNumbering id=' + this.id + ', level=' + this.level + ']';
  }
}

// Known mso-level-number-formats:
// decimal (default, probably)
// bullet
// alpha-lower
// roman-lower

/**
 * A list level definition taken from an `@list id:level { ... }` rule,
 * carrying the number format declared for that level.
 */
class WordList {
  constructor(public id: string, public level: string, public numberFormat: string) { }

  /** Key of the form "id:level" identifying this list id and level. */
  get formatKey(): string {
    return this.id + ':' + this.level;
  }

  /** True unless the number format is "bullet". */
  get ordered(): boolean {
    return this.numberFormat !== 'bullet';
  }

  /**
   * Builds a list definition from the body of an `@list` rule.
   * @param id the list id
   * @param level the list level
   * @param style the declarations inside the rule's braces
   * @return the list definition; the format falls back to "decimal" when no
   * `mso-level-number-format` declaration is present. (The current
   * implementation never returns null; the nullable type is kept so existing
   * callers keep compiling.)
   */
  static parse(id: string, level: string, style: string): WordList | null {
    // Format names may contain hyphens ("alpha-lower", "roman-lower"), so the
    // capture must accept "-" as well as word characters.
    const m = /\bmso-level-number-format:\s*([\w-]+)/.exec(style);
    return new WordList(id, level, m ? m[1] : 'decimal');
  }

  toString(): string {
    return '[WordList id=' + this.id + ', level=' + this.level + ', format=' + this.numberFormat + ']';
  }
}

/**
 * Parses out all list definitions, if possible. Every `<style>` element in
 * the document is scanned for `@list id:level { ... }` rules, and each rule
 * is turned into a WordList.
 * @param doc the document to parse word definitions in
 * @return the list definitions found, keyed by their "id:level" format key;
 * when the same key occurs more than once the last definition wins
 */
function parseListDefinitions(doc: Document): Map<string, WordList> {
  const result = new Map<string, WordList>();
  // Global regex shared by all style elements. exec() resets lastIndex to 0
  // when it runs out of matches, so each new scan starts from the beginning.
  const listDefRegExp = /@list\s+(\w+):(\w+)\s*\{([^}]*)\}/mg;
  const styles = doc.querySelectorAll('style');
  for (let idx = 0; idx < styles.length; idx++) {
    // textContent returns the raw stylesheet text regardless of rendering
    // state, whereas innerText depends on layout and is not available in
    // every DOM implementation.
    const css = styles[idx].textContent || '';
    let m: RegExpExecArray | null;
    while ((m = listDefRegExp.exec(css)) !== null) {
      const listDefinition = WordList.parse(m[1], m[2], m[3]);
      if (listDefinition) {
        result.set(listDefinition.formatKey, listDefinition);
      }
    }
  }
  return result;
}

/**
 * Converts Word's list paragraphs into real HTML lists, in place.
 *
 * Every element whose inline style mentions `mso-list` is a candidate. When
 * its numbering resolves to a known list definition, the element's contents
 * are moved into a new `<li>` inside an `<ol>` or `<ul>` (reusing the
 * immediately preceding sibling if it is already a list of the right kind),
 * and the original element is removed. Anything between the
 * `[if !supportLists]` and `[endif]` conditional comments is discarded rather
 * than moved, and the `[endif]` comment itself is dropped.
 * @param doc the document to convert lists in
 */
function convertWordLists(doc: Document): void {
  // Snapshot the candidates up front; the DOM is restructured as we go.
  const possibleLists = Array.from(doc.querySelectorAll('*[style*=mso-list]'));
  const listFormats = parseListDefinitions(doc);
  for (const element of possibleLists) {
    // Try and figure out what type of list this is
    const numbering = WordNumbering.parse(element);
    if (!numbering) {
      continue;
    }
    const listDef = listFormats.get(numbering.formatKey);
    if (!listDef) {
      continue;
    }
    // The element may have no parent element: it can be the document root, or
    // it may have been discarded while an earlier candidate was processed.
    // There is nowhere to put a list in that case, so skip it.
    const parent = element.parentElement;
    if (!parent) {
      continue;
    }

    // Wrap it in a real list, continuing the previous sibling if it is
    // already a list of the expected kind.
    const expectedList = listDef.ordered ? 'OL' : 'UL';
    const previous = element.previousElementSibling;
    let list: Element;
    if (previous && previous.tagName === expectedList) {
      list = previous;
    } else {
      list = doc.createElement(expectedList.toLowerCase());
      parent.insertBefore(list, element);
    }

    const li = doc.createElement('li');
    list.appendChild(li);

    // firstChild is live and both removeChild and appendChild take the node
    // out of `element`, so this loop drains the element one child at a time.
    let insideConditionalComment = false;
    while (element.firstChild !== null) {
      const child = element.firstChild;
      if (child.nodeType === Node.COMMENT_NODE) {
        if (child.textContent === '[if !supportLists]') {
          insideConditionalComment = true;
        } else if (child.textContent === '[endif]') {
          // Remove this but also stop removing other stuff
          element.removeChild(child);
          insideConditionalComment = false;
          continue;
        }
      }
      if (insideConditionalComment) {
        element.removeChild(child);
      } else {
        li.appendChild(child);
      }
    }
    parent.removeChild(element);
  }
}

/**
 * Parses an HTML string into a standalone Document. A fresh HTML document
 * titled "Pasted" is created and the markup is assigned as the inner HTML of
 * its root element.
 * @param html the HTML to parse
 * @return the newly created document holding the parsed markup
 */
function parseHTML(html: string): Document {
  const parsed = document.implementation.createHTMLDocument('Pasted');
  const rootElement = parsed.documentElement;
  rootElement.innerHTML = html;
  return parsed;
}

/**
 * Cleans HTML coming from Word and returns the result as a string. The work
 * is delegated to cleanWordDoc; this function serializes the contents of the
 * cleaned document's <body>.
 * @param html the HTML to clean
 * @return the cleaned HTML found inside the document body
 */
export function cleanWordHTML(html: string): string {
  const cleaned = cleanWordDoc(html);
  return cleaned.body.innerHTML;
}

/**
 * Cleans HTML coming from Word and returns the locally created Document that
 * holds the cleaned content. (Note: cruft is only stripped from the <body>
 * element; the <head> will still contain Word styles.)
 * @param html the HTML to clean
 * @return the resulting DOM
 */
export function cleanWordDoc(html: string): Document {
  const parsed = parseHTML(html);
  // Lists are converted first: the conversion reads the mso-list inline
  // styles and the conditional comments that the cruft removal strips out.
  convertWordLists(parsed);
  removeWordCruft(parsed.body);
  return parsed;
}

/**
 * Cleans HTML coming from Word. Returns a DocumentFragment which contains
 * whatever nodes were within the <body> of the cleaned document. The nodes
 * are moved, so the intermediate document's body is left empty.
 * @param html the HTML to clean
 * @return a fragment holding the cleaned body content
 */
export function cleanWordDocFragment(html: string): DocumentFragment {
  const wordDoc = cleanWordDoc(html);
  const fragment = wordDoc.createDocumentFragment();
  // appendChild moves each node out of the body, which would shift a live
  // childNodes list under us, so iterate over a snapshot instead.
  const nodes = Array.from(wordDoc.body.childNodes);
  for (const node of nodes) {
    fragment.appendChild(node);
  }
  return fragment;
}

/**
 * Determine if the content is from Word. Looks for the Word namespaces in the
 * text without trying to parse the document: the first `<html ...>` start tag
 * must declare both the Office ("urn:schemas-microsoft-com:office:office")
 * and the Word ("urn:schemas-microsoft-com:office:word") XML namespaces.
 *
 * The tag name is matched case-insensitively and the namespace values may be
 * wrapped in either double or single quotes.
 * @param html the HTML text to inspect
 * @return true if both namespaces are declared on the html tag
 */
export function isWordHTML(html: string): boolean {
  const htmlMatch = /<html\b[^>]*>/i.exec(html);
  if (!htmlMatch) {
    // No HTML tag at all, so this cannot be a complete Word document.
    return false;
  }
  const htmlTag = htmlMatch[0];
  // \1 requires the closing quote to match the opening one.
  const officeNamespace = /xmlns:\w+\s*=\s*(["'])urn:schemas-microsoft-com:office:office\1/;
  const wordNamespace = /xmlns:\w+\s*=\s*(["'])urn:schemas-microsoft-com:office:word\1/;
  return officeNamespace.test(htmlTag) && wordNamespace.test(htmlTag);
}