index.js

/*
 * This module encapsulates the official Jisho.org API
 * and also provides kanji and example search features that scrape Jisho.org.
 * Permission to scrape granted by Jisho's admin Kimtaro:
 *     https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api
 */

import axiosBuilder from 'axios';
import cheerio from 'cheerio';
import escapeStringRegexp from 'escape-string-regexp';
import { XmlEntities } from 'html-entities';

// HTTP client used for every request in this module; created with a timeout of 10000.
const axios = axiosBuilder.create({ timeout: 10000 });

// Endpoint of the official Jisho.org word search API.
const JISHO_API = 'https://jisho.org/api/v1/search/words';
// Base of the search page URIs that the kanji and example searches scrape.
const SCRAPE_BASE_URI = 'https://jisho.org/search/';

// This link does not use https because as of June 5, 2021 SSL is broken on classic.jisho.org
// (and even if it's been fixed since then, it will be safer to keep this as-is)
const STROKE_ORDER_DIAGRAM_BASE_URI = 'http://classic.jisho.org/static/images/stroke_diagrams/';

// Entity decoder applied to scraped meanings before they are returned.
const htmlEntities = new XmlEntities();

/* KANJI SEARCH FUNCTIONS START */

// Labels interpolated into the markup tokens (`<dt>On:</dt>`, `<h2>On reading compounds</h2>`, ...)
// that are used to locate the onyomi and kunyomi sections of a kanji page.
const ONYOMI_LOCATOR_SYMBOL = 'On';
const KUNYOMI_LOCATOR_SYMBOL = 'Kun';

/**
 * Strip every carriage return and line feed from a string, then trim it.
 * @param {string} str The text to clean up.
 * @returns {string} The text without line breaks or surrounding whitespace.
 */
function removeNewlines(str) {
  const withoutLineBreaks = str.replace(/(?:\r|\n)/g, '');
  return withoutLineBreaks.trim();
}

/**
 * Build the Jisho.org search URI whose page is scraped for kanji details.
 * @param {string} kanji The kanji to look up.
 * @returns {string} The search URI, ending in the URL-encoded `#kanji` marker.
 */
function uriForKanjiSearch(kanji) {
  const encodedKanji = encodeURIComponent(kanji);
  return `${SCRAPE_BASE_URI}${encodedKanji}%23kanji`;
}

/**
 * Build the URI of the stroke order diagram image for a kanji.
 * The image file is named after the decimal code point of the kanji's first character.
 * @param {string} kanji The kanji to get the diagram for.
 * @returns {string} The URI of the stroke order diagram.
 */
function getUriForStrokeOrderDiagram(kanji) {
  // codePointAt (rather than charCodeAt) so that kanji outside the Basic Multilingual Plane
  // yield their real code point instead of the value of their leading surrogate.
  return `${STROKE_ORDER_DIAGRAM_BASE_URI}${kanji.codePointAt(0).toString()}_frames.png`;
}

/**
 * Build the official Jisho.org API URI for a word/phrase search.
 * @param {string} phrase The search term.
 * @param {number} [page] The results page to request. Left out of the URI when falsy.
 * @returns {string} The API URI.
 */
function uriForPhraseSearch(phrase, page) {
  const baseUri = `${JISHO_API}?keyword=${encodeURIComponent(phrase)}`;
  return page ? `${baseUri}&page=${page}` : baseUri;
}


/**
 * Check whether a scraped page is the detail page for the given kanji, by looking for
 * the heading element that displays the character.
 * @param {string} pageHtml The HTML of the scraped page.
 * @param {string} kanji The kanji that was searched for.
 * @returns {boolean} True if the page contains the heading for this kanji.
 */
function containsKanjiGlyph(pageHtml, kanji) {
  return pageHtml.includes(`<h1 class="character" data-area-name="print" lang="ja">${kanji}</h1>`);
}

/**
 * Extract the text between two indices, with line breaks removed and the ends trimmed.
 * @param {string} data The text to extract from.
 * @param {number} startIndex Index of the first character to include.
 * @param {number} endIndex Index just past the last character to include.
 * @returns {string} The cleaned-up slice.
 */
function getStringBetweenIndicies(data, startIndex, endIndex) {
  return removeNewlines(data.substring(startIndex, endIndex)).trim();
}

/**
 * Find the shortest run of text (line breaks included) that sits between the first
 * occurrence of `startString` and the next occurrence of `endString`.
 * @param {string} data The text to search.
 * @param {string} startString The literal text that precedes the wanted run.
 * @param {string} endString The literal text that follows the wanted run.
 * @returns {string|undefined} The text in between, or undefined if there is no match.
 */
function getStringBetweenStrings(data, startString, endString) {
  const pattern = [
    escapeStringRegexp(startString),
    '(.*?)',
    escapeStringRegexp(endString),
  ].join('');

  const match = data.match(new RegExp(pattern, 's'));
  if (!match) {
    return undefined;
  }

  return match[1];
}

/**
 * Extract the text between two strings and parse it as a base-10 integer.
 * @param {string} pageHtml The text to search.
 * @param {string} startString The literal text that precedes the number.
 * @param {string} endString The literal text that follows the number.
 * @returns {number|undefined} The parsed integer, or undefined if nothing (or only an
 *   empty string) was found between the two strings.
 */
function getIntBetweenStrings(pageHtml, startString, endString) {
  const between = getStringBetweenStrings(pageHtml, startString, endString);
  return between ? parseInt(between, 10) : undefined;
}

/**
 * Run a regular expression repeatedly over a string and collect the first capture
 * group of every match.
 * @param {string} str The text to search.
 * @param {RegExp} regex The expression to execute; it is re-executed until it stops matching.
 * @returns {Array.<string>} The first capture group of each match, in order.
 */
function getAllGlobalGroupMatches(str, regex) {
  const captured = [];
  for (let match = regex.exec(str); match; match = regex.exec(str)) {
    captured.push(match[1]);
  }

  return captured;
}

/**
 * Collect the inner text of every `<a href="...">...</a>` element in a piece of HTML.
 * @param {string} str The HTML to search.
 * @returns {Array.<string>} The contents of each anchor, in order.
 */
function parseAnchorsToArray(str) {
  return getAllGlobalGroupMatches(str, /<a href=".*?">(.*?)<\/a>/g);
}

/**
 * Extract the readings listed under a `<dt>{symbol}:</dt>` heading of a kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @param {string} yomiLocatorSymbol The label of the readings section to extract.
 * @returns {Array.<string>} The readings, or an empty array if the section is missing.
 */
function getYomi(pageHtml, yomiLocatorSymbol) {
  const sectionStart = `<dt>${yomiLocatorSymbol}:</dt>`;
  const section = getStringBetweenStrings(pageHtml, sectionStart, '</dl>');
  return parseAnchorsToArray(section || '');
}

/**
 * Extract the kunyomi readings from a kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<string>} The kunyomi readings.
 */
function getKunyomi(pageHtml) {
  const kunyomi = getYomi(pageHtml, KUNYOMI_LOCATOR_SYMBOL);
  return kunyomi;
}

/**
 * Extract the onyomi readings from a kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<string>} The onyomi readings.
 */
function getOnyomi(pageHtml) {
  const onyomi = getYomi(pageHtml, ONYOMI_LOCATOR_SYMBOL);
  return onyomi;
}

/**
 * Extract the "reading compounds" examples listed for one kind of reading.
 * Each list item is expected to hold three lines: the example, its reading wrapped in
 * 【】 brackets, and its meaning.
 * @param {string} pageHtml The HTML of the kanji page.
 * @param {string} yomiLocatorSymbol The label of the reading kind to get examples for.
 * @returns {Array.<YomiExample>} The examples, or an empty array if the section is missing.
 */
function getYomiExamples(pageHtml, yomiLocatorSymbol) {
  const exampleSection = getStringBetweenStrings(
    pageHtml,
    `<h2>${yomiLocatorSymbol} reading compounds</h2>`,
    '</ul>',
  );

  if (!exampleSection) {
    return [];
  }

  return getAllGlobalGroupMatches(exampleSection, /<li>(.*?)<\/li>/gs).map((listItem) => {
    const [example, reading, meaning] = listItem.trim().split('\n').map(line => line.trim());

    return {
      example,
      reading: reading.replace('【', '').replace('】', ''),
      meaning: htmlEntities.decode(meaning),
    };
  });
}

/**
 * Extract the onyomi reading compound examples from a kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<YomiExample>} The onyomi examples.
 */
function getOnyomiExamples(pageHtml) {
  const examples = getYomiExamples(pageHtml, ONYOMI_LOCATOR_SYMBOL);
  return examples;
}

/**
 * Extract the kunyomi reading compound examples from a kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<YomiExample>} The kunyomi examples.
 */
function getKunyomiExamples(pageHtml) {
  const examples = getYomiExamples(pageHtml, KUNYOMI_LOCATOR_SYMBOL);
  return examples;
}

/**
 * Extract the radical of a kanji from its page.
 * The radical's meaning is read from the `radical_meaning` span; the radical symbol, and
 * optionally its alternative forms in parentheses, are read from the text that follows
 * that span up to the next closing `</span>`.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {{symbol: string, forms: (Array.<string>|undefined), meaning: string}|undefined}
 *   The radical information (`forms` is only present when alternative forms are listed),
 *   or undefined if the page has no radical meaning.
 */
function getRadical(pageHtml) {
  const radicalMeaningStartString = '<span class="radical_meaning">';
  const radicalMeaningEndString = '</span>';

  const rawRadicalMeaning = getStringBetweenStrings(
    pageHtml,
    radicalMeaningStartString,
    radicalMeaningEndString,
  );

  // getStringBetweenStrings returns undefined when the span is absent, so it must be
  // checked before trimming; otherwise a page without a radical would throw.
  const radicalMeaning = rawRadicalMeaning ? rawRadicalMeaning.trim() : '';
  if (!radicalMeaning) {
    return undefined;
  }

  const radicalMeaningStartIndex = pageHtml.indexOf(radicalMeaningStartString);

  const radicalMeaningEndIndex = pageHtml.indexOf(
    radicalMeaningEndString,
    radicalMeaningStartIndex,
  );

  // The symbol text starts right after the span that holds the meaning.
  const radicalSymbolStartIndex = radicalMeaningEndIndex + radicalMeaningEndString.length;
  const radicalSymbolEndString = '</span>';
  const radicalSymbolEndIndex = pageHtml.indexOf(radicalSymbolEndString, radicalSymbolStartIndex);

  const radicalSymbolsString = getStringBetweenIndicies(
    pageHtml,
    radicalSymbolStartIndex,
    radicalSymbolEndIndex,
  );

  if (radicalSymbolsString.length > 1) {
    // Anything after the first character is the parenthesized, comma separated list of forms.
    const radicalForms = radicalSymbolsString
      .substring(1)
      .replace('(', '')
      .replace(')', '')
      .trim()
      .split(', ');

    return { symbol: radicalSymbolsString[0], forms: radicalForms, meaning: radicalMeaning };
  }

  return { symbol: radicalSymbolsString, meaning: radicalMeaning };
}

/**
 * Extract the parts listed under the `Parts:` heading of a kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {Array.<string>} The parts, sorted with the default sort order.
 */
function getParts(pageHtml) {
  const partsSection = getStringBetweenStrings(pageHtml, '<dt>Parts:</dt>', '</dl>');
  const parts = parseAnchorsToArray(partsSection);
  parts.sort();
  return parts;
}

/**
 * Find the protocol-relative cloudfront.net URI of the stroke order SVG in a kanji page.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {string|undefined} The SVG URI with an `https:` scheme, or undefined if the page
 *   contains no such URI.
 */
function getSvgUri(pageHtml) {
  // Dots are escaped so only a literal ".cloudfront.net/" host and ".svg" extension match,
  // and the wildcard parts exclude whitespace, quotes and angle brackets so a match cannot
  // start at an unrelated "//" earlier on the line and swallow surrounding markup.
  const svgRegex = /\/\/[^\s"'<>]*?\.cloudfront\.net\/[^\s"'<>]*?\.svg/;
  const regexResult = svgRegex.exec(pageHtml);
  return regexResult ? `https:${regexResult[0]}` : undefined;
}

/**
 * Build the URI of the stroke order animation for a kanji.
 * The gif is named after the hexadecimal code point of the kanji's first character.
 * @param {string} kanji The kanji to get the animation for.
 * @returns {string} The URI of the gif in the mistval/kanji_images repository.
 */
function getGifUri(kanji) {
  const hexCodePoint = kanji.codePointAt(0).toString(16);
  return `https://raw.githubusercontent.com/mistval/kanji_images/master/gifs/${hexCodePoint}.gif`;
}

/**
 * Extract the newspaper frequency rank from a kanji page: the text inside the first
 * `<strong>` element of the `frequency` div.
 * @param {string} pageHtml The HTML of the kanji page.
 * @returns {string|undefined} The rank as it appears on the page (not parsed into a
 *   number), or undefined if the page has no frequency section.
 */
function getNewspaperFrequencyRank(pageHtml) {
  const frequencySection = getStringBetweenStrings(pageHtml, '<div class="frequency">', '</div>');
  if (!frequencySection) {
    return undefined;
  }

  return getStringBetweenStrings(frequencySection, '<strong>', '</strong>');
}

/**
 * Parse the HTML of a Jisho.org kanji search page into a result object.
 * @param {string} pageHtml The HTML of the scraped page.
 * @param {string} kanji The kanji that was searched for.
 * @returns {KanjiResult} The parsed information. When the page is not the detail page for
 *   this kanji, only `query` and `found` (false) are present.
 */
function parseKanjiPageData(pageHtml, kanji) {
  if (!containsKanjiGlyph(pageHtml, kanji)) {
    return { query: kanji, found: false };
  }

  const meaningHtml = getStringBetweenStrings(
    pageHtml,
    '<div class="kanji-details__main-meanings">',
    '</div>',
  );

  return {
    query: kanji,
    found: true,
    taughtIn: getStringBetweenStrings(pageHtml, 'taught in <strong>', '</strong>'),
    jlptLevel: getStringBetweenStrings(pageHtml, 'JLPT level <strong>', '</strong>'),
    newspaperFrequencyRank: getNewspaperFrequencyRank(pageHtml),
    strokeCount: getIntBetweenStrings(pageHtml, '<strong>', '</strong> strokes'),
    meaning: htmlEntities.decode(removeNewlines(meaningHtml).trim()),
    kunyomi: getKunyomi(pageHtml),
    onyomi: getOnyomi(pageHtml),
    onyomiExamples: getOnyomiExamples(pageHtml),
    kunyomiExamples: getKunyomiExamples(pageHtml),
    radical: getRadical(pageHtml),
    parts: getParts(pageHtml),
    strokeOrderDiagramUri: getUriForStrokeOrderDiagram(kanji),
    strokeOrderSvgUri: getSvgUri(pageHtml),
    strokeOrderGifUri: getGifUri(kanji),
    uri: uriForKanjiSearch(kanji),
  };
}

/* KANJI SEARCH FUNCTIONS END */

/* EXAMPLE SEARCH FUNCTIONS START */

// Matches characters in the ranges U+4E00-U+9FAF and U+3400-U+4DBF, plus the iteration
// mark 々. Used to tell kanji apart from kana when rebuilding the kana form of a sentence.
const kanjiRegex = /[\u4e00-\u9faf\u3400-\u4dbf々]/g;

/**
 * Build the Jisho.org search URI whose page is scraped for example sentences.
 * @param {string} phrase The word or phrase to find examples for.
 * @returns {string} The search URI, ending in the URL-encoded `#sentences` marker.
 */
function uriForExampleSearch(phrase) {
  const encodedPhrase = encodeURIComponent(phrase);
  return `${SCRAPE_BASE_URI}${encodedPhrase}%23sentences`;
}

/**
 * Rebuild the kanji form and the kana form of an example sentence from its markup.
 * The sentence is read from the first `ul` of the element. `li` children contribute their
 * `.unlinked` text to the kanji form and, when they carry furigana, the furigana plus any
 * trailing non-kanji characters of the `.unlinked` text to the kana form. Every other
 * child node contributes its trimmed text to both forms.
 * @param {Object} div The cheerio selection for one example sentence.
 * @returns {{kanji: string, kana: string}} The two forms of the sentence.
 */
function getKanjiAndKana(div) {
  const nodes = div.find('ul').eq(0).contents();

  let kanji = '';
  let kana = '';
  for (let nodeIndex = 0; nodeIndex < nodes.length; nodeIndex += 1) {
    const node = nodes.eq(nodeIndex);

    if (node[0].name !== 'li') {
      // Loose text between list items belongs to both forms unchanged.
      const text = node.text().trim();
      if (text) {
        kanji += text;
        kana += text;
      }
    } else {
      const furigana = node.find('.furigana').text();
      const baseText = node.find('.unlinked').text();
      kanji += baseText;

      if (!furigana) {
        kana += baseText;
      } else {
        // Walk back from the end over the trailing non-kanji characters. The first
        // character is never examined, so the tail starts at index 1 at the earliest.
        let tailStart = baseText.length;
        while (tailStart > 1 && !baseText[tailStart - 1].match(kanjiRegex)) {
          tailStart -= 1;
        }

        kana += furigana + baseText.slice(tailStart);
      }
    }
  }

  return { kanji, kana };
}

/**
 * Re-load a sentence element's HTML with loose text between list items wrapped into list
 * items of its own, so that every piece of the sentence is a `li.clearfix` element with
 * an `.unlinked` span.
 * @param {Object} sentenceElement The cheerio selection for one sentence.
 * @returns {Function} A cheerio instance loaded with the normalized HTML.
 */
function normalizeSentenceElement(sentenceElement) {
  const wrapLooseText = (match, looseText) => `<li class="clearfix"><span class="unlinked">${looseText}</span></li>`;

  const normalizedHtml = sentenceElement
    .html()
    .replace(/(?<=<\/li>)\s*([^\s<>]+)\s*(?=<li)/g, wrapLooseText);

  return cheerio.load(normalizedHtml);
}

/**
 * Split a sentence into its lifted/unlifted pieces.
 * @param {Object} sentenceElement The cheerio selection for one sentence.
 * @returns {Array.<ExampleSentencePiece>} One entry per `li.clearfix` element of the
 *   normalized sentence, holding its furigana (lifted) and base (unlifted) text.
 */
function getPieces(sentenceElement) {
  const $ = normalizeSentenceElement(sentenceElement);
  const pieceElements = $('li.clearfix');

  return Array.from({ length: pieceElements.length }, (_, pieceIndex) => {
    const pieceElement = pieceElements.eq(pieceIndex);

    return {
      lifted: pieceElement.children('.furigana').text(),
      unlifted: pieceElement.children('.unlinked').text(),
    };
  });
}

/**
 * Parse one example sentence element into its English, kanji, kana and piece data.
 * @param {Object} div The cheerio selection for one example sentence.
 * @returns {ExampleResultData} The parsed example.
 */
function parseExampleDiv(div) {
  const { kanji, kana } = getKanjiAndKana(div);

  return {
    english: div.find('.english').text(),
    kanji,
    kana,
    pieces: getPieces(div),
  };
}

/**
 * Parse the HTML of a Jisho.org example search page.
 * @param {string} pageHtml The HTML of the scraped page.
 * @param {string} phrase The word or phrase that was searched for.
 * @returns {ExampleResults} The examples found in every `.sentence_content` element. The
 *   search term is returned both as `query` and as `phrase`.
 */
function parseExamplePageData(pageHtml, phrase) {
  const $ = cheerio.load(pageHtml);
  const sentenceDivs = $('.sentence_content');

  const results = Array.from(
    { length: sentenceDivs.length },
    (_, divIndex) => parseExampleDiv(sentenceDivs.eq(divIndex)),
  );

  return {
    query: phrase,
    found: results.length > 0,
    results,
    uri: uriForExampleSearch(phrase),
    phrase,
  };
}

/* EXAMPLE SEARCH FUNCTIONS END */

/* PHRASE SCRAPE FUNCTIONS START */

/**
 * Collect the text of every `.concept_light-tag` element of a word page.
 * @param {Function} $ The cheerio instance loaded with the word page.
 * @returns {Array.<string>} The tag texts, in document order.
 */
function getTags($) {
  const tagElements = $('.concept_light-tag');

  return Array.from(
    { length: tagElements.length },
    (_, tagIndex) => tagElements.eq(tagIndex).text(),
  );
}

/**
 * Walk the children of the meanings section of a word page and sort them into meanings,
 * other forms and notes.
 * A child with the `meaning-tags` class sets the tags that apply to the children that
 * follow it. Depending on the first of those tags, each following child is parsed as the
 * "other forms" list, as the "notes" text, or as a meaning.
 * NOTE: this function removes elements from the loaded document while it reads it
 * (anchors inside `.meaning-abstract`, and the `.english` and `.furigana` elements of
 * each sentence).
 * @param {Function} $ The cheerio instance loaded with the word page.
 * @returns {{meanings: Array.<PhraseScrapeMeaning>,
 *   otherForms: Array.<PhraseScrapeJapaneseWord>, notes: Array.<string>}} The parsed data.
 */
function getMeaningsOtherFormsAndNotes($) {
  const returnValues = { otherForms: [], notes: [] };

  const meaningsWrapper = $('#page_container > div > div > article > div > div.concept_light-meanings.medium-9.columns > div');
  const meaningsChildren = meaningsWrapper.children();
  const meanings = [];

  // Tags of the most recently seen `meaning-tags` child; they apply to the children after it.
  let mostRecentWordTypes = [];
  for (let meaningIndex = 0; meaningIndex < meaningsChildren.length; meaningIndex += 1) {
    const child = meaningsChildren.eq(meaningIndex);
    if (child.hasClass('meaning-tags')) {
      mostRecentWordTypes = child.text().split(',').map(s => s.trim().toLowerCase());
    } else if (mostRecentWordTypes[0] === 'other forms') {
      // Entries are separated by '、'; each is split on a space into kanji and kana
      // after the 【】 brackets are stripped.
      returnValues.otherForms = child.text().split('、')
        .map(s => s.replace('【', '').replace('】', '').split(' '))
        .map(a => ({ kanji: a[0], kana: a[1] }));
    } else if (mostRecentWordTypes[0] === 'notes') {
      returnValues.notes = child.text().split('\n');
    } else {
      const meaning = child.find('.meaning-meaning').text();
      // Anchors are removed from the abstract before its text is read.
      const meaningAbstract = child.find('.meaning-abstract')
        .find('a')
        .remove()
        .end()
        .text();

      const supplemental = child.find('.supplemental_info').text().split(',')
        .map(s => s.trim())
        .filter(s => s);

      // Move "See also ..." entries out of the supplemental list. Iterating backwards
      // keeps the remaining indices valid while splicing; as a result the terms are
      // collected in reverse order of appearance.
      const seeAlsoTerms = [];
      for (let i = supplemental.length - 1; i >= 0; i -= 1) {
        const supplementalEntry = supplemental[i];
        if (supplementalEntry.startsWith('See also')) {
          seeAlsoTerms.push(supplementalEntry.replace('See also ', ''));
          supplemental.splice(i, 1);
        }
      }

      const sentences = [];
      const sentenceElements = child.find('.sentences').children('.sentence');

      for (let sentenceIndex = 0; sentenceIndex < sentenceElements.length; sentenceIndex += 1) {
        const sentenceElement = sentenceElements.eq(sentenceIndex);

        // The English text and the pieces must be read before the elements they rely on
        // are removed below.
        const english = sentenceElement.find('.english').text();
        const pieces = getPieces(sentenceElement);

        // What is left after dropping the translation and the furigana is the plain
        // Japanese sentence.
        const japanese = sentenceElement
          .find('.english').remove().end()
          .find('.furigana')
          .remove()
          .end()
          .text();

        sentences.push({ english, japanese, pieces });
      }

      meanings.push({
        seeAlsoTerms,
        sentences,
        definition: meaning,
        supplemental,
        definitionAbstract: meaningAbstract,
        tags: mostRecentWordTypes,
      });
    }
  }

  returnValues.meanings = meanings;

  return returnValues;
}

/**
 * Collect the audio recordings offered on a word page.
 * @param {Function} $ The cheerio instance loaded with the word page.
 * @returns {Array.<AudioFile>} One entry per `audio > source` element found inside
 *   `.concept_light-status`, with `https:` prepended to its `src` attribute.
 */
function getAudio($) {
  const sourceElements = $('.concept_light-status').find('audio > source');

  const audio = [];
  sourceElements.each((_, element) => {
    const { src, type } = element.attribs;
    audio.push({ uri: `https:${src}`, mimetype: type });
  });

  return audio;
}

/**
 * Build the URI of the Jisho.org word page for a search term.
 * @param {string} searchTerm The word or phrase to look up.
 * @returns {string} The word page URI.
 */
function uriForPhraseScrape(searchTerm) {
  const encodedTerm = encodeURIComponent(searchTerm);
  return `https://jisho.org/word/${encodedTerm}`;
}

/**
 * Parse the HTML of a Jisho.org word page.
 * @param {string} pageHtml The HTML of the scraped page.
 * @param {string} query The term that was searched for.
 * @returns {PhrasePageScrapeResult} The parsed information, always with `found` set to true.
 */
function parsePhrasePageData(pageHtml, query) {
  const $ = cheerio.load(pageHtml);

  // Parsed first, as in the original order: this step removes elements from the document.
  const parsedMeanings = getMeaningsOtherFormsAndNotes($);
  const audio = getAudio($);
  const uri = uriForPhraseScrape(query);
  const tags = getTags($);

  return {
    found: true,
    query,
    uri,
    tags,
    meanings: parsedMeanings.meanings,
    otherForms: parsedMeanings.otherForms,
    audio,
    notes: parsedMeanings.notes,
  };
}

/* PHRASE SCRAPE FUNCTIONS END */

/**
 * @typedef {Object} PhraseScrapeSentence
 * @property {string} english The English meaning of the sentence.
 * @property {string} japanese The Japanese text of the sentence.
 * @property {Array.<ExampleSentencePiece>} pieces The lifted/unlifted pairs
 *   that make up the sentence. Lifted text is furigana, unlifted is the text below the furigana.
 */

/**
 * @typedef {Object} PhraseScrapeMeaning
 * @property {Array.<string>} seeAlsoTerms The words that Jisho lists as "see also".
 * @property {Array.<PhraseScrapeSentence>} sentences Example sentences for this meaning.
 * @property {string} definition The definition.
 * @property {Array.<string>} supplemental Supplemental information.
 *   For example "usually written using kana alone".
 * @property {string} definitionAbstract An "abstract" definition.
 *   Often this is a Wikipedia definition.
 * @property {Array.<string>} tags Tags associated with this meaning.
 */

/** @typedef {Object} PhraseScrapeJapaneseWord
 * @property {string} kanji The Japanese word, written in kanji if available
 * @property {string} [kana] The corresponding kana spelling of the whole word, if kanji is present
 */

/** @typedef {Object} AudioFile
 * @property {string} uri The uri pointing to the audio file
 * @property {string} mimetype The mimetype of the audio file. Usually mp3 or ogg
 */

/**
 * @typedef {Object} PhrasePageScrapeResult
 * @property {boolean} found True if a result was found.
 * @property {string} query The term that you searched for.
 * @property {string} [uri] The URI that these results were scraped from, if a result was found.
 * @property {Array.<PhraseScrapeJapaneseWord>} [otherForms] Other forms of the search term, if a
 *   result was found.
 * @property {Array.<PhraseScrapeMeaning>} [meanings] Information about the meanings associated
 *   with result.
 * @property {Array.<string>} [tags] Tags associated with this search result.
 * @property {Array.<AudioFile>} [audio] Recordings of the word, in different file formats if
 *   present
 * @property {Array.<string>} [notes] Notes associated with the search result.
 */

/**
 * @typedef {Object} YomiExample
 * @property {string} example The original text of the example.
 * @property {string} reading The reading of the example.
 * @property {string} meaning The meaning of the example.
 */

/**
 * @typedef {Object} KanjiResult
 * @property {boolean} found True if results were found.
 * @property {string} query The term that you searched for.
 * @property {string} [taughtIn] The school level that the kanji is taught in, if applicable.
 * @property {string} [jlptLevel] The lowest JLPT exam that this kanji is likely to
 *   appear in, if applicable. 'N5' or 'N4' or 'N3' or 'N2' or 'N1'.
 * @property {string} [newspaperFrequencyRank] This kanji's frequency rank in newspapers,
 *   as a numeric string (it is not parsed into a number), if applicable.
 * @property {number} [strokeCount] How many strokes this kanji is typically drawn in,
 *   if applicable.
 * @property {string} [meaning] The meaning of the kanji, if applicable.
 * @property {Array.<string>} [kunyomi] This character's kunyomi, if applicable.
 * @property {Array.<YomiExample>} [kunyomiExamples] Examples of this character's kunyomi
 *   being used, if applicable.
 * @property {Array.<string>} [onyomi] This character's onyomi, if applicable.
 * @property {Array.<YomiExample>} [onyomiExamples] Examples of this character's onyomi
 *   being used, if applicable.
 * @property {Object} [radical] Information about this character's radical, if applicable.
 * @property {string} [radical.symbol] The radical symbol, if applicable.
 * @property {Array.<string>} [radical.forms] The radical forms used in this kanji, if applicable.
 * @property {string} [radical.meaning] The meaning of the radical, if applicable.
 * @property {Array.<string>} [parts] The parts used in this kanji, if applicable.
 * @property {string} [strokeOrderDiagramUri] The URL to a diagram showing how to draw this kanji
 *   step by step, if applicable.
 * @property {string} [strokeOrderSvgUri] The URL to an SVG describing how to draw this kanji,
 *   if applicable.
 * @property {string} [strokeOrderGifUri] The URL to a gif showing the kanji being drawn and
 *   its stroke order, if applicable.
 * @property {string} [uri] The URI that these results were scraped from, if applicable.
 */

/**
 * @typedef {Object} ExampleSentencePiece
 * @property {string} unlifted Baseline text shown on Jisho.org (below the lifted text / furigana)
 * @property {string} lifted Furigana text shown on Jisho.org (above the unlifted text)
 */

/**
 * @typedef {Object} ExampleResultData
 * @property {string} kanji The example sentence including kanji.
 * @property {string} kana The example sentence without kanji (only kana). Sometimes this may
 *   include some Kanji, as furigana is not always available from Jisho.org.
 * @property {string} english An English translation of the example.
 * @property {Array.<ExampleSentencePiece>} pieces The lifted/unlifted pairs
 *   that make up the sentence. Lifted text is furigana, unlifted is the text below the furigana.
 */

/**
 * @typedef {Object} ExampleResults
 * @property {string} query The term that you searched for.
 * @property {boolean} found True if results were found.
 * @property {string} uri The URI that these results were scraped from.
 * @property {Array.<ExampleResultData>} results The examples that were found, if any.
 */

/**
 * A wrapper around the Jisho search functions.
 */
class API {
  /**
   * Query the official Jisho API for a word or phrase. See
   * [here]{@link https://jisho.org/forum/54fefc1f6e73340b1f160000-is-there-any-kind-of-search-api}
   * for discussion about the official API.
   * @param {string} phrase The search term to search for.
   * @param {number} [page] The results page to request. Left out of the request when falsy.
   * @returns {Object} The response data from the official Jisho.org API. Its format is somewhat
   *   complex and is not documented, so put on your trial-and-error hat.
   * @async
   */
  async searchForPhrase(phrase, page) {
    // The URI is built inside the async function so that a failure to encode the search
    // term rejects the returned promise instead of throwing synchronously.
    const uri = uriForPhraseSearch(phrase, page);
    const response = await axios.get(uri);
    return response.data;
  }

  /**
   * Scrape the word page for a word/phrase. This allows you to
   * get some information that isn't provided by the official API, such as
   * part-of-speech and JLPT level. However, the official API should be preferred
   * if it has the information you need. This function scrapes https://jisho.org/word/XXX.
   * In general, you'll want to include kanji in your search term, for example 掛かる
   * instead of かかる (no results).
   * @param {string} phrase The search term to search for.
   * @returns {PhrasePageScrapeResult} Information about the searched query.
   * @async
   */
  async scrapeForPhrase(phrase) {
    const uri = uriForPhraseScrape(phrase);
    try {
      const response = await axios.get(uri);
      return parsePhrasePageData(response.data, phrase);
    } catch (err) {
      // A 404 response means there is no word page for this term; report it as not found.
      if (err.response && err.response.status === 404) {
        return {
          query: phrase,
          found: false,
        };
      }

      throw err;
    }
  }

  /**
   * Scrape Jisho.org for information about a kanji character.
   * @param {string} kanji The kanji to search for.
   * @returns {KanjiResult} Information about the searched kanji.
   * @async
   */
  async searchForKanji(kanji) {
    const uri = uriForKanjiSearch(kanji);
    const response = await axios.get(uri);
    return parseKanjiPageData(response.data, kanji);
  }

  /**
   * Scrape Jisho.org for examples.
   * @param {string} phrase The word or phrase to search for.
   * @returns {ExampleResults}
   * @async
   */
  async searchForExamples(phrase) {
    const uri = uriForExampleSearch(phrase);
    const response = await axios.get(uri);
    return parseExamplePageData(response.data, phrase);
  }
}

// Expose the URI builders and the HTML parsers on API instances, under their public names.
API.prototype.getUriForKanjiSearch = uriForKanjiSearch;
API.prototype.getUriForExampleSearch = uriForExampleSearch;
API.prototype.getUriForPhraseSearch = uriForPhraseSearch;
API.prototype.getUriForPhraseScrape = uriForPhraseScrape;
API.prototype.parseExamplePageHtml = parseExamplePageData;
API.prototype.parseKanjiPageHtml = parseKanjiPageData;
API.prototype.parsePhraseScrapeHtml = parsePhrasePageData;

export default API;