diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000000000000000000000000000000000..e9d1a3b31f3c38008b27857ba2ab79728e8612ba --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,47 @@ +{ + "files.associations": { + "atomic": "cpp", + "bit": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "compare": "cpp", + "concepts": "cpp", + "cstddef": "cpp", + "cstdint": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "exception": "cpp", + "initializer_list": "cpp", + "ios": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "iterator": "cpp", + "limits": "cpp", + "memory": "cpp", + "new": "cpp", + "ostream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "typeinfo": "cpp", + "utility": "cpp", + "xfacet": "cpp", + "xiosbase": "cpp", + "xlocale": "cpp", + "xlocinfo": "cpp", + "xlocnum": "cpp", + "xmemory": "cpp", + "xstddef": "cpp", + "xstring": "cpp", + "xtr1common": "cpp", + "xutility": "cpp" + }, + "cmake.ignoreCMakeListsMissing": true +} \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000000000000000000000000000000000000..a8bd2672163a7d94c4b773924af53ba8b6d5855a --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,28 @@ +{ + "tasks": [ + { + "type": "cppbuild", + "label": "C/C++: g++.exe 生成活动文件", + "command": "C:\\Program Files\\mingw64\\bin\\g++.exe", + "args": [ + "-fdiagnostics-color=always", + "-g", + "${file}", + "-o", + "${fileDirname}\\${fileBasenameNoExtension}.exe" + ], + "options": { + "cwd": "${fileDirname}" + }, + "problemMatcher": [ + "$gcc" + ], + "group": { + "kind": "build", + "isDefault": true + }, + "detail": "调试器生成的任务。" + } + ], + "version": "2.0.0" +} \ No newline at end of file diff --git a/FinGPT-master.zip b/FinGPT-master.zip new file mode 100644 index 
0000000000000000000000000000000000000000..e080201157f79b829e7df0d2fb17529282376264 --- /dev/null +++ b/FinGPT-master.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5532a3e67103f1e0ef1d2eef7639ab7ec8e6ca6076e6cbc72d37b472507a5205 +size 11524574 diff --git a/FinGPT.ipynb b/FinGPT.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b762a40d9999b8481ac0a6a2e835619d971b5cbf --- /dev/null +++ b/FinGPT.ipynb @@ -0,0 +1,868 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FinGPT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 1: Preparing the Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1.1 Initialize Directories:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "\n", + "jsonl_path = \"data/dataset_new.jsonl\"\n", + "save_path = 'data/dataset_new'\n", + "\n", + "\n", + "if os.path.exists(jsonl_path):\n", + " os.remove(jsonl_path)\n", + "\n", + "if os.path.exists(save_path):\n", + " shutil.rmtree(save_path)\n", + "\n", + "directory = \"data\"\n", + "if not os.path.exists(directory):\n", + " os.makedirs(directory)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1.2 Load and Prepare Dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import datasets\n", + "\n", + "dic = {\n", + " 0:\"negative\",\n", + " 1:'positive',\n", + " 2:'neutral',\n", + "}\n", + "\n", + "tfns = load_dataset('zeroshot/twitter-financial-news-sentiment')\n", + "tfns = tfns['train']\n", + "tfns = tfns.to_pandas()\n", + "tfns['label'] = tfns['label'].apply(lambda x:dic[x])\n", + "tfns['instruction'] = 'What is the sentiment of this tweet? 
Please choose an answer from {negative/neutral/positive}.'\n", + "tfns.columns = ['input', 'output', 'instruction']\n", + "tfns = datasets.Dataset.from_pandas(tfns)\n", + "tfns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1.3 Concatenate and Shuffle Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tmp_dataset = datasets.concatenate_datasets([tfns]*2)\n", + "train_dataset = tmp_dataset\n", + "print(tmp_dataset.num_rows)\n", + "\n", + "all_dataset = train_dataset.shuffle(seed = 42)\n", + "all_dataset.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2: Dataset Formatting and Tokenization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.1 Dataset Formatting:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "from tqdm.notebook import tqdm\n", + "\n", + "\n", + "def format_example(example: dict) -> dict:\n", + " context = f\"Instruction: {example['instruction']}\\n\"\n", + " if example.get(\"input\"):\n", + " context += f\"Input: {example['input']}\\n\"\n", + " context += \"Answer: \"\n", + " target = example[\"output\"]\n", + " return {\"context\": context, \"target\": target}\n", + "\n", + "\n", + "data_list = []\n", + "for item in all_dataset.to_pandas().itertuples():\n", + " tmp = {}\n", + " tmp[\"instruction\"] = item.instruction\n", + " tmp[\"input\"] = item.input\n", + " tmp[\"output\"] = item.output\n", + " data_list.append(tmp)\n", + "\n", + "\n", + "# save to a jsonl file\n", + "with open(\"data/dataset_new.jsonl\", 'w') as f:\n", + " for example in tqdm(data_list, desc=\"formatting..\"):\n", + " f.write(json.dumps(format_example(example)) + '\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.2 Tokenization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoConfig\n", + "\n", + "model_name = \"THUDM/chatglm2-6b\"\n", + "jsonl_path = \"data/dataset_new.jsonl\" # updated path\n", + "save_path = 'data/dataset_new' # updated path\n", + "max_seq_length = 512\n", + "skip_overlength = True\n", + "\n", + "# The preprocess function tokenizes the prompt and target, combines them into input IDs,\n", + "# and then trims or pads the sequence to the maximum sequence length.\n", + "def preprocess(tokenizer, config, example, max_seq_length):\n", + " prompt = example[\"context\"]\n", + " target = example[\"target\"]\n", + " prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)\n", + " target_ids = tokenizer.encode(\n", + " target,\n", + " max_length=max_seq_length,\n", + " truncation=True,\n", + " add_special_tokens=False)\n", + " input_ids = prompt_ids + target_ids + [config.eos_token_id]\n", + " return {\"input_ids\": input_ids, \"seq_len\": len(prompt_ids)}\n", + "\n", + "# The read_jsonl function reads each line from the JSONL file, preprocesses it using the preprocess function,\n", + "# and then yields each preprocessed example.\n", + "def read_jsonl(path, max_seq_length, skip_overlength=False):\n", + " tokenizer = AutoTokenizer.from_pretrained(\n", + " model_name, trust_remote_code=True)\n", + " config = AutoConfig.from_pretrained(\n", + " model_name, trust_remote_code=True, device_map='auto')\n", + " with open(path, \"r\") as f:\n", + " for line in tqdm(f.readlines()):\n", + " example = json.loads(line)\n", + " feature = preprocess(tokenizer, config, example, max_seq_length)\n", + " if skip_overlength and len(feature[\"input_ids\"]) > max_seq_length:\n", + " continue\n", + " feature[\"input_ids\"] = feature[\"input_ids\"][:max_seq_length]\n", + " yield feature" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2.3 Save the dataset" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The script then creates a Hugging Face Dataset object from the generator and saves it to disk.\n", + "save_path = './data/dataset_new'\n", + "\n", + "dataset = datasets.Dataset.from_generator(\n", + " lambda: read_jsonl(jsonl_path, max_seq_length, skip_overlength)\n", + " )\n", + "dataset.save_to_disk(save_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 3: Setup FinGPT training parameters with LoRA on ChatGlm2–6b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.1 Training Arguments Setup:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "W0801 20:19:58.973000 23260 site-packages\\torch\\distributed\\elastic\\multiprocessing\\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.\n" + ] + } + ], + "source": [ + "from typing import List, Dict, Optional\n", + "import torch\n", + "from loguru import logger\n", + "from transformers import (\n", + " AutoModel,\n", + " AutoTokenizer,\n", + " TrainingArguments,\n", + " Trainer,\n", + " BitsAndBytesConfig\n", + ")\n", + "from peft import (\n", + " TaskType,\n", + " LoraConfig,\n", + " get_peft_model,\n", + " set_peft_model_state_dict,\n", + " prepare_model_for_kbit_training,\n", + " prepare_model_for_int8_training,\n", + ")\n", + "from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir='./finetuned_model', # saved model path\n", + " # max_steps=10000,\n", + " num_train_epochs = 2,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=8,\n", + " learning_rate=1e-4,\n", + " weight_decay=0.01,\n", + " warmup_steps=10,\n", + " save_steps=50,\n", + " fp16=True,\n", + " # bf16=True,\n", + " torch_compile = False,\n", + " 
load_best_model_at_end = True,\n", + " evaluation_strategy=\"steps\",\n", + " remove_unused_columns=False,\n", + " logging_steps = 50,\n", + " eval_steps = 50,\n", + " logging_dir='./logs',\n", + " report_to=\"tensorboard\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.2 Quantization Config Setup:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Quantization\n", + "q_config = BitsAndBytesConfig(load_in_4bit=True,\n", + " bnb_4bit_quant_type='nf4',\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_compute_dtype=torch.float16\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.3 Model Loading & Preparation:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\huggingface_hub\\file_download.py:945: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4bc8b4c85e974cfe806fda92d57ad1c3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/7 [00:00 to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is\n", + ":DefaultFlowCallback\n", + "TensorBoardCallback\n", + "d:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\transformers\\optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. 
Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "27521448ffea440ba40770ef24937509", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/954 [00:00 \u001b[39m\u001b[32m63\u001b[39m trainer.train()\n\u001b[32m 64\u001b[39m writer.close()\n\u001b[32m 65\u001b[39m \u001b[38;5;66;03m# save model\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\transformers\\trainer.py:1645\u001b[39m, in \u001b[36mTrainer.train\u001b[39m\u001b[34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[39m\n\u001b[32m 1640\u001b[39m \u001b[38;5;28mself\u001b[39m.model_wrapped = \u001b[38;5;28mself\u001b[39m.model\n\u001b[32m 1642\u001b[39m inner_training_loop = find_executable_batch_size(\n\u001b[32m 1643\u001b[39m \u001b[38;5;28mself\u001b[39m._inner_training_loop, \u001b[38;5;28mself\u001b[39m._train_batch_size, args.auto_find_batch_size\n\u001b[32m 1644\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1645\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m inner_training_loop(\n\u001b[32m 1646\u001b[39m args=args,\n\u001b[32m 1647\u001b[39m resume_from_checkpoint=resume_from_checkpoint,\n\u001b[32m 1648\u001b[39m trial=trial,\n\u001b[32m 1649\u001b[39m ignore_keys_for_eval=ignore_keys_for_eval,\n\u001b[32m 1650\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\transformers\\trainer.py:1938\u001b[39m, in \u001b[36mTrainer._inner_training_loop\u001b[39m\u001b[34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[39m\n\u001b[32m 1935\u001b[39m \u001b[38;5;28mself\u001b[39m.control = \u001b[38;5;28mself\u001b[39m.callback_handler.on_step_begin(args, \u001b[38;5;28mself\u001b[39m.state, 
\u001b[38;5;28mself\u001b[39m.control)\n\u001b[32m 1937\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m.accelerator.accumulate(model):\n\u001b[32m-> \u001b[39m\u001b[32m1938\u001b[39m tr_loss_step = \u001b[38;5;28mself\u001b[39m.training_step(model, inputs)\n\u001b[32m 1940\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[32m 1941\u001b[39m args.logging_nan_inf_filter\n\u001b[32m 1942\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_tpu_available()\n\u001b[32m 1943\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m (torch.isnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch.isinf(tr_loss_step))\n\u001b[32m 1944\u001b[39m ):\n\u001b[32m 1945\u001b[39m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[32m 1946\u001b[39m tr_loss += tr_loss / (\u001b[32m1\u001b[39m + \u001b[38;5;28mself\u001b[39m.state.global_step - \u001b[38;5;28mself\u001b[39m._globalstep_last_logged)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\transformers\\trainer.py:2759\u001b[39m, in \u001b[36mTrainer.training_step\u001b[39m\u001b[34m(self, model, inputs)\u001b[39m\n\u001b[32m 2756\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb.reduce_mean().detach().to(\u001b[38;5;28mself\u001b[39m.args.device)\n\u001b[32m 2758\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m.compute_loss_context_manager():\n\u001b[32m-> \u001b[39m\u001b[32m2759\u001b[39m loss = \u001b[38;5;28mself\u001b[39m.compute_loss(model, inputs)\n\u001b[32m 2761\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.args.n_gpu > \u001b[32m1\u001b[39m:\n\u001b[32m 2762\u001b[39m loss = loss.mean() \u001b[38;5;66;03m# mean() to average on multi-gpu parallel training\u001b[39;00m\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[6]\u001b[39m\u001b[32m, line 3\u001b[39m, in 
\u001b[36mModifiedTrainer.compute_loss\u001b[39m\u001b[34m(self, model, inputs)\u001b[39m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute_loss\u001b[39m(\u001b[38;5;28mself\u001b[39m, model, inputs):\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m model(\n\u001b[32m 4\u001b[39m input_ids=inputs[\u001b[33m\"\u001b[39m\u001b[33minput_ids\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 5\u001b[39m labels=inputs[\u001b[33m\"\u001b[39m\u001b[33mlabels\u001b[39m\u001b[33m\"\u001b[39m],\n\u001b[32m 6\u001b[39m ).loss\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_impl(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks 
\u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(*args, **kwargs)\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\accelerate\\utils\\operations.py:687\u001b[39m, in \u001b[36mconvert_outputs_to_fp32..forward\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 686\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mforward\u001b[39m(*args, **kwargs):\n\u001b[32m--> \u001b[39m\u001b[32m687\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m model_forward(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\accelerate\\utils\\operations.py:675\u001b[39m, in \u001b[36mConvertOutputsToFp32.__call__\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 674\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, *args, **kwargs):\n\u001b[32m--> \u001b[39m\u001b[32m675\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m convert_to_fp32(\u001b[38;5;28mself\u001b[39m.model_forward(*args, **kwargs))\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\amp\\autocast_mode.py:44\u001b[39m, in \u001b[36mautocast_decorator..decorate_autocast\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 41\u001b[39m 
\u001b[38;5;129m@functools\u001b[39m.wraps(func)\n\u001b[32m 42\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mdecorate_autocast\u001b[39m(*args, **kwargs):\n\u001b[32m 43\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m autocast_instance:\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m func(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\peft\\peft_model.py:1091\u001b[39m, in \u001b[36mPeftModelForCausalLM.forward\u001b[39m\u001b[34m(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)\u001b[39m\n\u001b[32m 1089\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m peft_config.peft_type == PeftType.POLY:\n\u001b[32m 1090\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mtask_ids\u001b[39m\u001b[33m\"\u001b[39m] = task_ids\n\u001b[32m-> \u001b[39m\u001b[32m1091\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.base_model(\n\u001b[32m 1092\u001b[39m input_ids=input_ids,\n\u001b[32m 1093\u001b[39m attention_mask=attention_mask,\n\u001b[32m 1094\u001b[39m inputs_embeds=inputs_embeds,\n\u001b[32m 1095\u001b[39m labels=labels,\n\u001b[32m 1096\u001b[39m output_attentions=output_attentions,\n\u001b[32m 1097\u001b[39m output_hidden_states=output_hidden_states,\n\u001b[32m 1098\u001b[39m return_dict=return_dict,\n\u001b[32m 1099\u001b[39m **kwargs,\n\u001b[32m 1100\u001b[39m )\n\u001b[32m 1102\u001b[39m batch_size = _get_batch_size(input_ids, inputs_embeds)\n\u001b[32m 1103\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m attention_mask \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 1104\u001b[39m \u001b[38;5;66;03m# concat prompt attention mask\u001b[39;00m\n", + "\u001b[36mFile 
\u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_impl(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(*args, **kwargs)\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = 
\u001b[38;5;28mset\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\peft\\tuners\\tuners_utils.py:160\u001b[39m, in \u001b[36mBaseTuner.forward\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 159\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, *args: Any, **kwargs: Any):\n\u001b[32m--> \u001b[39m\u001b[32m160\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m.model.forward(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\accelerate\\hooks.py:165\u001b[39m, in \u001b[36madd_hook_to_module..new_forward\u001b[39m\u001b[34m(module, *args, **kwargs)\u001b[39m\n\u001b[32m 163\u001b[39m output = module._old_forward(*args, **kwargs)\n\u001b[32m 164\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m165\u001b[39m output = module._old_forward(*args, **kwargs)\n\u001b[32m 166\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m module._hf_hook.post_forward(module, output)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.cache\\huggingface\\modules\\transformers_modules\\THUDM\\chatglm2-6b\\d2e2d91789248536a747d9ce60642a336444186c\\modeling_chatglm.py:937\u001b[39m, in \u001b[36mChatGLMForConditionalGeneration.forward\u001b[39m\u001b[34m(self, input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, return_last_logit)\u001b[39m\n\u001b[32m 934\u001b[39m use_cache = use_cache \u001b[38;5;28;01mif\u001b[39;00m use_cache \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m.config.use_cache\n\u001b[32m 935\u001b[39m return_dict = return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict 
\u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m.config.use_return_dict\n\u001b[32m--> \u001b[39m\u001b[32m937\u001b[39m transformer_outputs = \u001b[38;5;28mself\u001b[39m.transformer(\n\u001b[32m 938\u001b[39m input_ids=input_ids,\n\u001b[32m 939\u001b[39m position_ids=position_ids,\n\u001b[32m 940\u001b[39m attention_mask=attention_mask,\n\u001b[32m 941\u001b[39m past_key_values=past_key_values,\n\u001b[32m 942\u001b[39m inputs_embeds=inputs_embeds,\n\u001b[32m 943\u001b[39m use_cache=use_cache,\n\u001b[32m 944\u001b[39m output_hidden_states=output_hidden_states,\n\u001b[32m 945\u001b[39m return_dict=return_dict,\n\u001b[32m 946\u001b[39m )\n\u001b[32m 948\u001b[39m hidden_states = transformer_outputs[\u001b[32m0\u001b[39m]\n\u001b[32m 949\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m return_last_logit:\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_impl(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call 
forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(*args, **kwargs)\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\accelerate\\hooks.py:165\u001b[39m, in \u001b[36madd_hook_to_module..new_forward\u001b[39m\u001b[34m(module, *args, **kwargs)\u001b[39m\n\u001b[32m 163\u001b[39m output = module._old_forward(*args, **kwargs)\n\u001b[32m 164\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m165\u001b[39m output = module._old_forward(*args, **kwargs)\n\u001b[32m 166\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m module._hf_hook.post_forward(module, output)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.cache\\huggingface\\modules\\transformers_modules\\THUDM\\chatglm2-6b\\d2e2d91789248536a747d9ce60642a336444186c\\modeling_chatglm.py:830\u001b[39m, in \u001b[36mChatGLMModel.forward\u001b[39m\u001b[34m(self, input_ids, position_ids, attention_mask, full_attention_mask, past_key_values, inputs_embeds, use_cache, output_hidden_states, return_dict)\u001b[39m\n\u001b[32m 827\u001b[39m rotary_pos_emb = 
rotary_pos_emb.transpose(\u001b[32m0\u001b[39m, \u001b[32m1\u001b[39m).contiguous()\n\u001b[32m 829\u001b[39m \u001b[38;5;66;03m# Run encoder.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m830\u001b[39m hidden_states, presents, all_hidden_states, all_self_attentions = \u001b[38;5;28mself\u001b[39m.encoder(\n\u001b[32m 831\u001b[39m inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb,\n\u001b[32m 832\u001b[39m kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states\n\u001b[32m 833\u001b[39m )\n\u001b[32m 835\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m return_dict:\n\u001b[32m 836\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mtuple\u001b[39m(v \u001b[38;5;28;01mfor\u001b[39;00m v \u001b[38;5;129;01min\u001b[39;00m [hidden_states, presents, all_hidden_states, all_self_attentions] \u001b[38;5;28;01mif\u001b[39;00m v \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_impl(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic 
in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(*args, **kwargs)\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\accelerate\\hooks.py:165\u001b[39m, in \u001b[36madd_hook_to_module..new_forward\u001b[39m\u001b[34m(module, *args, **kwargs)\u001b[39m\n\u001b[32m 163\u001b[39m output = module._old_forward(*args, **kwargs)\n\u001b[32m 164\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m165\u001b[39m output = module._old_forward(*args, **kwargs)\n\u001b[32m 166\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m module._hf_hook.post_forward(module, output)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.cache\\huggingface\\modules\\transformers_modules\\THUDM\\chatglm2-6b\\d2e2d91789248536a747d9ce60642a336444186c\\modeling_chatglm.py:631\u001b[39m, in \u001b[36mGLMTransformer.forward\u001b[39m\u001b[34m(self, hidden_states, attention_mask, rotary_pos_emb, kv_caches, use_cache, 
output_hidden_states)\u001b[39m\n\u001b[32m 629\u001b[39m layer = \u001b[38;5;28mself\u001b[39m._get_layer(index)\n\u001b[32m 630\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.gradient_checkpointing \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.training:\n\u001b[32m--> \u001b[39m\u001b[32m631\u001b[39m layer_ret = torch.utils.checkpoint.checkpoint(\n\u001b[32m 632\u001b[39m layer,\n\u001b[32m 633\u001b[39m hidden_states,\n\u001b[32m 634\u001b[39m attention_mask,\n\u001b[32m 635\u001b[39m rotary_pos_emb,\n\u001b[32m 636\u001b[39m kv_caches[index],\n\u001b[32m 637\u001b[39m use_cache\n\u001b[32m 638\u001b[39m )\n\u001b[32m 639\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 640\u001b[39m layer_ret = layer(\n\u001b[32m 641\u001b[39m hidden_states,\n\u001b[32m 642\u001b[39m attention_mask,\n\u001b[32m (...)\u001b[39m\u001b[32m 645\u001b[39m use_cache=use_cache\n\u001b[32m 646\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\_compile.py:53\u001b[39m, in \u001b[36m_disable_dynamo..inner\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 50\u001b[39m disable_fn = torch._dynamo.disable(fn, recursive, wrapping=\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[32m 51\u001b[39m fn.__dynamo_disable = disable_fn \u001b[38;5;66;03m# type: ignore[attr-defined]\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m53\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m disable_fn(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\_dynamo\\eval_frame.py:1005\u001b[39m, in \u001b[36mDisableContext.__call__.._fn\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 1003\u001b[39m _maybe_set_eval_frame(_callback_from_stance(\u001b[38;5;28mself\u001b[39m.callback))\n\u001b[32m 1004\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1005\u001b[39m 
\u001b[38;5;28;01mreturn\u001b[39;00m fn(*args, **kwargs)\n\u001b[32m 1006\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 1007\u001b[39m set_eval_frame(\u001b[38;5;28;01mNone\u001b[39;00m)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\utils\\checkpoint.py:488\u001b[39m, in \u001b[36mcheckpoint\u001b[39m\u001b[34m(function, use_reentrant, context_fn, determinism_check, debug, *args, **kwargs)\u001b[39m\n\u001b[32m 483\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m context_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m noop_context_fn \u001b[38;5;129;01mor\u001b[39;00m debug \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m:\n\u001b[32m 484\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 485\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mPassing `context_fn` or `debug` is only supported when \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 486\u001b[39m \u001b[33m\"\u001b[39m\u001b[33muse_reentrant=False.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 487\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m488\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m CheckpointFunction.apply(function, preserve, *args)\n\u001b[32m 489\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 490\u001b[39m gen = _checkpoint_without_reentrant_generator(\n\u001b[32m 491\u001b[39m function, preserve, context_fn, determinism_check, debug, *args, **kwargs\n\u001b[32m 492\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\autograd\\function.py:581\u001b[39m, in \u001b[36mFunction.apply\u001b[39m\u001b[34m(cls, *args, **kwargs)\u001b[39m\n\u001b[32m 578\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m torch._C._are_functorch_transforms_active():\n\u001b[32m 579\u001b[39m \u001b[38;5;66;03m# See NOTE: 
[functorch vjp and autograd interaction]\u001b[39;00m\n\u001b[32m 580\u001b[39m args = _functorch.utils.unwrap_dead_wrappers(args)\n\u001b[32m--> \u001b[39m\u001b[32m581\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m().apply(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 583\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_setup_ctx_defined:\n\u001b[32m 584\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[32m 585\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mIn order to use an autograd.Function with functorch transforms \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 586\u001b[39m \u001b[33m\"\u001b[39m\u001b[33m(vmap, grad, jvp, jacrev, ...), it must override the setup_context \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 587\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mstaticmethod. For more details, please see \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 588\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mhttps://pytorch.org/docs/main/notes/extending.func.html\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 589\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\utils\\checkpoint.py:262\u001b[39m, in \u001b[36mCheckpointFunction.forward\u001b[39m\u001b[34m(ctx, run_function, preserve_rng_state, *args)\u001b[39m\n\u001b[32m 259\u001b[39m ctx.save_for_backward(*tensor_inputs)\n\u001b[32m 261\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m--> \u001b[39m\u001b[32m262\u001b[39m outputs = run_function(*args)\n\u001b[32m 263\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1773\u001b[39m, in \u001b[36mModule._wrapped_call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1771\u001b[39m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._compiled_call_impl(*args, **kwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[32m 1772\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m-> \u001b[39m\u001b[32m1773\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m._call_impl(*args, **kwargs)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1784\u001b[39m, in \u001b[36mModule._call_impl\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1779\u001b[39m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[32m 1780\u001b[39m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[32m 1781\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m._backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m._forward_pre_hooks\n\u001b[32m 1782\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[32m 1783\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[32m-> \u001b[39m\u001b[32m1784\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m forward_call(*args, **kwargs)\n\u001b[32m 1786\u001b[39m result = \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[32m 1787\u001b[39m called_always_called_hooks = \u001b[38;5;28mset\u001b[39m()\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\accelerate\\hooks.py:165\u001b[39m, in \u001b[36madd_hook_to_module..new_forward\u001b[39m\u001b[34m(module, *args, 
**kwargs)\u001b[39m\n\u001b[32m 163\u001b[39m output = module._old_forward(*args, **kwargs)\n\u001b[32m 164\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m165\u001b[39m output = module._old_forward(*args, **kwargs)\n\u001b[32m 166\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m module._hf_hook.post_forward(module, output)\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/.cache\\huggingface\\modules\\transformers_modules\\THUDM\\chatglm2-6b\\d2e2d91789248536a747d9ce60642a336444186c\\modeling_chatglm.py:562\u001b[39m, in \u001b[36mGLMBlock.forward\u001b[39m\u001b[34m(self, hidden_states, attention_mask, rotary_pos_emb, kv_cache, use_cache)\u001b[39m\n\u001b[32m 559\u001b[39m layernorm_input = residual + layernorm_input\n\u001b[32m 561\u001b[39m \u001b[38;5;66;03m# Layer norm post the self attention.\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m562\u001b[39m layernorm_output = \u001b[38;5;28mself\u001b[39m.post_attention_layernorm(layernorm_input)\n\u001b[32m 564\u001b[39m \u001b[38;5;66;03m# MLP.\u001b[39;00m\n\u001b[32m 565\u001b[39m mlp_output = \u001b[38;5;28mself\u001b[39m.mlp(layernorm_output)\n", + "\u001b[36mFile \u001b[39m\u001b[32md:\\anaconda\\envs\\fingpt-env\\Lib\\site-packages\\torch\\nn\\modules\\module.py:1949\u001b[39m, in \u001b[36mModule.__getattr__\u001b[39m\u001b[34m(self, name)\u001b[39m\n\u001b[32m 1944\u001b[39m \u001b[38;5;28mself\u001b[39m._backward_pre_hooks = OrderedDict()\n\u001b[32m 1946\u001b[39m \u001b[38;5;66;03m# It is crucial that the return type is not annotated as `Any`, otherwise type checking\u001b[39;00m\n\u001b[32m 1947\u001b[39m \u001b[38;5;66;03m# on `torch.nn.Module` and all its subclasses is largely disabled as a result. 
See:\u001b[39;00m\n\u001b[32m 1948\u001b[39m \u001b[38;5;66;03m# https://github.com/pytorch/pytorch/pull/115074\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1949\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__getattr__\u001b[39m(\u001b[38;5;28mself\u001b[39m, name: \u001b[38;5;28mstr\u001b[39m) -> Union[Tensor, \u001b[33m\"\u001b[39m\u001b[33mModule\u001b[39m\u001b[33m\"\u001b[39m]:\n\u001b[32m 1950\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[33m\"\u001b[39m\u001b[33m_parameters\u001b[39m\u001b[33m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.\u001b[34m__dict__\u001b[39m:\n\u001b[32m 1951\u001b[39m _parameters = \u001b[38;5;28mself\u001b[39m.\u001b[34m__dict__\u001b[39m[\u001b[33m\"\u001b[39m\u001b[33m_parameters\u001b[39m\u001b[33m\"\u001b[39m]\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: " + ] + } + ], + "source": [ + "class ModifiedTrainer(Trainer):\n", + " def compute_loss(self, model, inputs):\n", + " return model(\n", + " input_ids=inputs[\"input_ids\"],\n", + " labels=inputs[\"labels\"],\n", + " ).loss\n", + "\n", + " def prediction_step(self, model: torch.nn.Module, inputs, prediction_loss_only: bool, ignore_keys = None):\n", + " with torch.no_grad():\n", + " res = model(\n", + " input_ids=inputs[\"input_ids\"].to(model.device),\n", + " labels=inputs[\"labels\"].to(model.device),\n", + " ).loss\n", + " return (res, None, None)\n", + "\n", + " def save_model(self, output_dir=None, _internal_call=False):\n", + " from transformers.trainer import TRAINING_ARGS_NAME\n", + "\n", + " os.makedirs(output_dir, exist_ok=True)\n", + " torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))\n", + " saved_params = {\n", + " k: v.to(\"cpu\") for k, v in self.model.named_parameters() if v.requires_grad\n", + " }\n", + " torch.save(saved_params, os.path.join(output_dir, \"adapter_model.bin\"))\n", + "\n", + "def data_collator(features: list) -> dict:\n", + " len_ids = 
[len(feature[\"input_ids\"]) for feature in features]\n", + " longest = max(len_ids)\n", + " input_ids = []\n", + " labels_list = []\n", + " for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):\n", + " ids = feature[\"input_ids\"]\n", + " seq_len = feature[\"seq_len\"]\n", + " labels = (\n", + " [tokenizer.pad_token_id] * (seq_len - 1) + ids[(seq_len - 1) :] + [tokenizer.pad_token_id] * (longest - ids_l)\n", + " )\n", + " ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)\n", + " _ids = torch.LongTensor(ids)\n", + " labels_list.append(torch.LongTensor(labels))\n", + " input_ids.append(_ids)\n", + " input_ids = torch.stack(input_ids)\n", + " labels = torch.stack(labels_list)\n", + " return {\n", + " \"input_ids\": input_ids,\n", + " \"labels\": labels,\n", + " }\n", + "\n", + "from torch.utils.tensorboard import SummaryWriter\n", + "from transformers.integrations import TensorBoardCallback\n", + "\n", + "# Train\n", + "# Took about 10 compute units\n", + "# Took 1 hour to train\n", + "writer = SummaryWriter()\n", + "trainer = ModifiedTrainer(\n", + " model=model,\n", + " args=training_args, # Trainer args\n", + " train_dataset=dataset[\"train\"], # Training set\n", + " eval_dataset=dataset[\"test\"], # Testing set\n", + " data_collator=data_collator, # Data Collator\n", + " callbacks=[TensorBoardCallback(writer)],\n", + ")\n", + "trainer.train()\n", + "writer.close()\n", + "# save model\n", + "model.save_pretrained(training_args.output_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 5: Inference and Benchmarks using FinGPT" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.1 Load the model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path exists.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": 
"014c4259e386457ca0892de97c5a8ec3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/7 [00:00 +image + + +# FinNLP: Internet-scale Financial Data + +[![Downloads](https://static.pepy.tech/badge/finnlp)]([https://pepy.tech/project/finnlp](https://pepy.tech/project/finnlp)) +[![Downloads](https://static.pepy.tech/badge/finnlp/week)](https://pepy.tech/project/finnlp) +[![Python 3.8](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) +[![PyPI](https://img.shields.io/pypi/v/finnlp.svg)](https://pypi.org/project/finnlp/) +![License](https://img.shields.io/github/license/AI4Finance-Foundation/finnlp.svg?color=brightgreen) + +FinNLP provides a playground for all people interested in LLMs and NLP in Finance. Here we provide full pipelines for LLM training and finetuning in the field of finance. + +![Visitors](https://api.visitorbadge.io/api/VisitorHit?user=AI4Finance-Foundation&repo=FinNLP&countColor=%23B17A) + + +## Ⅰ. How to Use + +### 1. News + +* US + + ``` python + # Finnhub (Yahoo Finance, Reuters, SeekingAlpha, CNBC...) + from finnlp.data_sources.news.finnhub_date_range import Finnhub_Date_Range + + start_date = "2023-01-01" + end_date = "2023-01-03" + config = { + "use_proxy": "us_free", # use proxies to prvent ip blocking + "max_retry": 5, + "proxy_pages": 5, + "token": "YOUR_FINNHUB_TOKEN" # Available at https://finnhub.io/dashboard + } + + news_downloader = Finnhub_Date_Range(config) # init + news_downloader.download_date_range_stock(start_date,end_date) # Download headers + news_downloader.gather_content() # Download contents + df = news_downloader.dataframe + selected_columns = ["headline", "content"] + df[selected_columns].head(10) + + -------------------- + + # headline content + # 0 My 26-Stock $349k Portfolio Gets A Nice Petrob... Home\nInvesting Strategy\nPortfolio Strategy\n... + # 1 Apple’s Market Cap Slides Below $2 Trillion fo... 
Error + # 2 US STOCKS-Wall St starts the year with a dip; ... (For a Reuters live blog on U.S., UK and Europ... + # 3 Buy 4 January Dogs Of The Dow, Watch 4 More Home\nDividends\nDividend Quick Picks\nBuy 4 J... + # 4 Apple's stock market value falls below $2 tril... Jan 3 (Reuters) - Apple Inc's \n(AAPL.O)\n sto... + # 5 CORRECTED-UPDATE 1-Apple's stock market value ... Jan 3 (Reuters) - Apple Inc's \n(AAPL.O)\n sto... + # 6 Apple Stock Falls Amid Report Of Product Order... Apple stock got off to a slow start in 2023 as... + # 7 US STOCKS-Wall St starts the year with a dip; ... Summary\nCompanies\nTesla shares plunge on Q4 ... + # 8 More than $1 trillion wiped off value of Apple... apple store\nMore than $1 trillion has been wi... + # 9 McLean's Iridium inks agreement to put its sat... The company hasn't named its partner, but it's... + ``` + + + +* China + + ``` python + # Sina Finance + from finnlp.data_sources.news.sina_finance_date_range import Sina_Finance_Date_Range + + start_date = "2016-01-01" + end_date = "2016-01-02" + config = { + "use_proxy": "china_free", # use proxies to prevent ip blocking + "max_retry": 5, + "proxy_pages": 5, + } + + news_downloader = Sina_Finance_Date_Range(config) # init + news_downloader.download_date_range_all(start_date,end_date) # Download headers + news_downloader.gather_content() # Download contents + df = news_downloader.dataframe + selected_columns = ["title", "content"] + df[selected_columns].head(10) + + -------------------- + + # title content + # 0 分析师:伊朗重回国际原油市场无法阻止 新浪美股讯 北京时间1月1日晚CNBC称,加拿大皇家银行(RBC)分析师Helima Cro... + # 1 FAA:波音767的逃生扶梯存在缺陷 新浪美股讯 北京时间1日晚,美国联邦航空局(FAA)要求航空公司对波音767机型的救生扶梯进... + # 2 非制造业新订单指数创新高 需求回升力度明显 中新社北京1月1日电 (记者 刘长忠)记者1日从中国物流与采购联合会获悉,在最新发布的201... + # 3 雷曼兄弟针对大和证券提起索赔诉讼 新浪美股讯 北京时间1日下午共同社称,2008年破产的美国金融巨头雷曼兄弟公司的清算法人日前... + # 4 国内钢铁PMI有所回升 钢市低迷形势有所改善 新华社上海1月1日专电(记者李荣)据中物联钢铁物流专业委员会1日发布的指数报告,2015年1... + # 5 马息岭凸显朝鲜旅游体育战略 新浪美股北京时间1日讯 三位单板滑雪手将成为最早拜访马息岭滑雪场的西方专业运动员,他们本月就... 
+ # 6 五洲船舶破产清算 近十年来首现国有船厂倒闭 (原标题:中国首家国有船厂破产倒闭)\n低迷的中国造船市场,多年来首次出现国有船厂破产清算的... + # 7 过半城市房价环比上涨 百城住宅均价加速升温 资料图。中新社记者 武俊杰 摄\n中新社北京1月1日电 (记者 庞无忌)中国房地产市场在20... + # 8 经济学人:巴西病根到底在哪里 新浪美股北京时间1日讯 原本,巴西人是该高高兴兴迎接2016年的。8月间,里约热内卢将举办南... + # 9 中国首家国有船厂破产倒闭:五洲船舶目前已停工 低迷的中国造船市场,多年来首次出现国有船厂破产清算的一幕。浙江海运集团旗下的五洲船舶修造公司... + + # Eastmoney 东方财富 + from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming + + pages = 3 + stock = "600519" + config = { + "use_proxy": "china_free", + "max_retry": 5, + "proxy_pages": 5, + } + + news_downloader = Eastmoney_Streaming(config) + news_downloader.download_streaming_stock(stock,pages) + df = news_downloader.dataframe + selected_columns = ["title", "create time"] + df[selected_columns].head(10) + + -------------------- + + # title create time + # 0 茅台2022年报的12个小秘密 04-09 19:40 + # 1 东北证券维持贵州茅台买入评级 预计2023年净利润同比 04-09 11:24 + # 2 贵州茅台:融资余额169.34亿元,创近一年新低(04-07 04-08 07:30 + # 3 贵州茅台:融资净买入1248.48万元,融资余额169.79亿 04-07 07:28 + # 4 贵州茅台公益基金会正式成立 04-06 12:29 + # 5 贵州茅台04月04日获沪股通增持19.55万股 04-05 07:48 + # 6 贵州茅台:融资余额169.66亿元,创近一年新低(04-04 04-05 07:30 + # 7 4月4日北向资金最新动向(附十大成交股) 04-04 18:48 + # 8 大宗交易:贵州茅台成交235.9万元,成交价1814.59元( 04-04 17:21 + # 9 第一上海证券维持贵州茅台买入评级 目标价2428.8元 04-04 09:30 + ``` + +### 2. Social Media + +* US + + ``` python + # Stocktwits + from finnlp.data_sources.social_media.stocktwits_streaming import Stocktwits_Streaming + + pages = 3 + stock = "AAPL" + config = { + "use_proxy": "us_free", + "max_retry": 5, + "proxy_pages": 2, + } + + downloader = Stocktwits_Streaming(config) + downloader.download_date_range_stock(stock, pages) + selected_columns = ["created_at", "body"] + downloader.dataframe[selected_columns].head(10) + + -------------------- + + # created_at body + # 0 2023-04-07T15:24:22Z NANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL... + # 1 2023-04-07T15:17:43Z $AAPL $SPY \n \nhttps://amp.scmp.com/news/chi... + # 2 2023-04-07T15:17:25Z $AAPL $GOOG $AMZN I took a Trump today. \n\nH... 
+ # 3 2023-04-07T15:16:54Z $SPY $AAPL will take this baby down, time for ... + # 4 2023-04-07T15:11:37Z $SPY $3T it ALREADY DID - look at the pre-COV... + # 5 2023-04-07T15:10:29Z $AAPL $QQQ $STUDY We are on to the next one! A... + # 6 2023-04-07T15:06:00Z $AAPL was analyzed by 48 analysts. The buy con... + # 7 2023-04-07T14:54:29Z $AAPL both retiring. \n \nCraig.... + # 8 2023-04-07T14:40:06Z $SPY $QQQ $TSLA $AAPL SPY 500 HAS STARTED🚀😍 BI... + # 9 2023-04-07T14:38:57Z Nancy 🩵 (Tim) $AAPL + ``` + + ``` python + # Reddit Wallstreetbets + from finnlp.data_sources.social_media.reddit_streaming import Reddit_Streaming + + pages = 3 + config = { + "use_proxy": "us_free", + "max_retry": 5, + "proxy_pages": 2, + } + + downloader = Reddit_Streaming(config) + downloader.download_streaming_all(pages) + selected_columns = ["created", "title"] + downloader.dataframe[selected_columns].head(10) + + -------------------- + + # created title + # 0 2023-04-07 15:39:34 Y’all making me feel like spooderman + # 1 2022-12-21 04:09:42 Do you track your investments in a spreadsheet... + # 2 2022-12-21 04:09:42 Do you track your investments in a spreadsheet... + # 3 2023-04-07 15:29:23 Can a Blackberry holder get some help 🥺 + # 4 2023-04-07 14:49:55 The week of CPI and FOMC Minutes… 4-6-23 SPY/ ... + # 5 2023-04-07 14:19:22 Well let’s hope your job likes you, thanks Jerome + # 6 2023-04-07 14:06:32 Does anyone else feel an overwhelming sense of... + # 7 2023-04-07 13:47:59 Watermarked Jesus explains the market being cl... + # 8 2023-04-07 13:26:23 Jobs report shows 236,000 gain in March. Hot l... + # 9 2023-04-07 13:07:15 The recession is over! Let's buy more stocks! 
+ ``` + +* China (Weibo) + + ``` python + # Weibo + from finnlp.data_sources.social_media.weibo_date_range import Weibo_Date_Range + + start_date = "2016-01-01" + end_date = "2016-01-02" + stock = "茅台" + config = { + "use_proxy": "china_free", + "max_retry": 5, + "proxy_pages": 5, + "cookies": "Your_Login_Cookies", + } + + downloader = Weibo_Date_Range(config) + downloader.download_date_range_stock(start_date, end_date, stock = stock) + df = downloader.dataframe + df = df.drop_duplicates() + selected_columns = ["date", "content"] + df[selected_columns].head(10) + + -------------------- + + # date content + # 0 2016-01-01 #舆论之锤#唯品会发声明证实销售假茅台-手机腾讯网O网页链接分享来自浏览器! + # 2 2016-01-01 2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原... + # 6 2016-01-01 2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原... + # 17 2016-01-01 开心,今天喝了两斤酒(茅台+扎二)三个人,开心! + # 18 2016-01-01 一家专卖假货的网站某宝,你该学学了!//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品... + # 19 2016-01-01 一家专卖假货的网站//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品会售假茅台:供货商... + # 20 2016-01-01 前几天说了几点不看好茅台的理由,今年过节喝点茅台支持下,个人口感,茅台比小五好喝,茅台依然是... + # 21 2016-01-01 老杜酱酒已到货,从明天起正式在甘肃武威开卖。可以不相信我说的话,但一定不要怀疑@杜子建的为人... + # 22 2016-01-01 【唯品会售假茅台后续:供货商被刑拘顾客获十倍补偿】此前,有网友投诉其在唯品会购买的茅台酒质量... + # 23 2016-01-01 唯品会卖假茅台,供货商被刑拘,买家获十倍补偿8888元|此前,有网友在网络论坛发贴(唯品会宣... + ``` + +### 3. Company Announcement + +* US + + ``` python + # SEC + from finnlp.data_sources.company_announcement.sec import SEC_Announcement + + start_date = "2020-01-01" + end_date = "2020-06-01" + stock = "AAPL" + config = { + "use_proxy": "us_free", + "max_retry": 5, + "proxy_pages": 3, + } + + downloader = SEC_Announcement(config) + downloader.download_date_range_stock(start_date, end_date, stock = stock) + selected_columns = ["file_date", "display_names", "content"] + downloader.dataframe[selected_columns].head(10) + + -------------------- + + # file_date display_names content + # 0 2020-05-12 [KONDO CHRIS (CIK 0001631982), Apple Inc. (A... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... 
+ # 1 2020-04-30 [JUNG ANDREA (CIK 0001051401), Apple Inc. (A... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + # 2 2020-04-17 [O'BRIEN DEIRDRE (CIK 0001767094), Apple Inc.... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + # 3 2020-04-17 [KONDO CHRIS (CIK 0001631982), Apple Inc. (A... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + # 4 2020-04-09 [Maestri Luca (CIK 0001513362), Apple Inc. (... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + # 5 2020-04-03 [WILLIAMS JEFFREY E (CIK 0001496686), Apple I... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + # 6 2020-04-03 [Maestri Luca (CIK 0001513362), Apple Inc. (... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + # 7 2020-02-28 [WAGNER SUSAN (CIK 0001059235), Apple Inc. (... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + # 8 2020-02-28 [LEVINSON ARTHUR D (CIK 0001214128), Apple In... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + # 9 2020-02-28 [JUNG ANDREA (CIK 0001051401), Apple Inc. (A... SEC Form 4 \n FORM 4UNITED STATES SECURITIES... + ``` + +* China + + ``` python + # Juchao + from finnlp.data_sources.company_announcement.juchao import Juchao_Announcement + + start_date = "2020-01-01" + end_date = "2020-06-01" + stock = "000001" + config = { + "use_proxy": "china_free", + "max_retry": 5, + "proxy_pages": 3, + } + + downloader = Juchao_Announcement(config) + downloader.download_date_range_stock(start_date, end_date, stock = stock, get_content = True, delate_pdf = True) + selected_columns = ["announcementTime", "shortTitle","Content"] + downloader.dataframe[selected_columns].head(10) + + -------------------- + + # announcementTime shortTitle Content + # 0 2020-05-27 关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告 证券代码: 000001 证券简称:平安银行 ... + # 1 2020-05-22 2019年年度权益分派实施公告 1 证券代码: 000001 证券简称:平安银行 ... + # 2 2020-05-20 关于获准发行小微企业贷款专项金融债券的公告 证券代码: 000001 证券简称:平安银行 ... + # 3 2020-05-16 监事会决议公告 1 证券代码: 000001 证券简称: 平安银行 ... + # 4 2020-05-15 2019年年度股东大会决议公告 1 证券代码: 000001 证券简称:平安银行 ... 
+ # 5 2020-05-15 2019年年度股东大会的法律意见书 北京总部 电话 : (86 -10) 8519 -1300 传真 : (86 -10... + # 6 2020-04-30 中信证券股份有限公司、平安证券股份有限公司关于公司关联交易有关事项的核查意见 1 中信证券股份有限公司 、平安证券股份有限 公司 关于平安银行股份有限公司 关联交易 有... + # 7 2020-04-30 独立董事独立意见 1 平安银行股份有限公司独立董事独立意见 根据《关于在上市公司建立独立董事制度的指导... + # 8 2020-04-30 关联交易公告 1 证券代码: 000001 证券简称:平安银行 ... + # 9 2020-04-21 2020年第一季度报告全文 证券代码: 000001 证券简称:平安银行 ... + ``` + + +## Ⅱ. Data Sources + +### 1. News + +| Platform | Data Type | Related Market | Specified Company | Range Type | Limits | Support | +| :----------------------------------------------------------: | :--------: | :------------: | :----------------------------------------------------------: | :---------------: | :-------------------: | ------------------------------------------------------------ | +| Yahoo | Financial News | US Stocks | √ | Date Range | N/A | √ | +| Reuters | General News | US Stocks | × | Date Range | N/A | Soon | +| Seeking Alpha | Financial News | US Stocks | √ | Streaming | N/A | √ | +| Sina | Financial News | CN Stocks | × | Date Range | N/A | √ | +| Eastmoney | Financial News | CN Stocks | √ | Date Range | N/A | √ | +| Yicai | Financial News | CN Stocks | √ | Date Range | N/A | Soon | +| CCTV | General News | CN Stocks | × | Date Range | N/A | √ | +| US Mainstream Media | Financial News | US Stocks | √ | Date Range | Account (Free) | √ | +| CN Mainstream Media | Financial News | CN Stocks | × | Date Range | Account (¥500/year) | √ | + +### 2. 
Social Media + +| Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Support | +| :---------------------: | :-------: | :------------: | :---------------: | :--------: | :---------: | :-----: | :-----: | +| Twitter | Tweets | US Stocks | √ | Date Range | Official | N/A | √ | +| Twitter | Sentiment | US Stocks | √ | Date Range | Third Party | N/A | √ | +| StockTwits | Tweets | US Stocks | √ | Latest | Official | N/A | √ | +| Reddit (wallstreetbets) | Threads | US Stocks | × | Latest | Official | N/A | √ | +| Reddit | Sentiment | US Stocks | √ | Date Range | Third Party | N/A | √ | +| Weibo | Tweets | CN Stocks | √ | Date Range | Official | Cookies | √ | +| Weibo | Tweets | CN Stocks | √ | Latest | Official | N/A | √ | + +### 3. Company Announcement +| Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Support | +| :-----------------------: | :-------: | :------------: | :---------------: | :--------: | :---------: | :----: | :-----: | +| Juchao (Official Website) | Text | CN Stocks | √ | Date Range | Official | N/A | √ | +| SEC (Official Website) | Text | US Stocks | √ | Date Range | Official | N/A | √ | +| Sina | Text | CN Stocks | √ | Latest | Third Party | N/A | √ | + + +### 4. Data Sets + | Data Source | Type | Stocks | Dates | Available | + | :--------------: | :----: | :----: | :-------: | :--------------: | + | [AShare](https://github.com/JinanZou/Astock) | News | 3680 | 2018-07-01 to 2021-11-30 | √ | + | [stocknet-dataset](https://github.com/yumoxu/stocknet-dataset) | Tweets | 87 | 2014-01-02 to 2015-12-30 | √ | + | [CHRNN](https://github.com/wuhuizhe/CHRNN) | Tweets | 38 | 2017-01-03 to 2017-12-28 | √ | + +## Ⅲ. 
Large Language Models (LLMs) +* [ChatGPT (GPT 3.5)](https://openai.com/blog/chatgpt) +* [GPT 4.0](https://openai.com/research/gpt-4) +* [ChatGLM](https://github.com/THUDM/ChatGLM-6B) +* [PaLM](https://developers.googleblog.com/2023/03/announcing-palm-api-and-makersuite.html) +* [LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) +* [FinBERT](https://github.com/yya518/FinBERT) +* [Hugging Face](https://huggingface.co/) + +## LICENSE + +MIT License + +**Disclaimer: We are sharing code for academic purposes under the MIT education license. Nothing herein is financial advice, and NOT a recommendation to trade real money. Please use common sense and always first consult a professional before trading or investing.** + diff --git a/FinNLP/demo/README.md b/FinNLP/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9597b14f47e96f9141b67cadb96526a4c57af6c9 --- /dev/null +++ b/FinNLP/demo/README.md @@ -0,0 +1,15 @@ +## Demos: + +### Ⅰ. ChatGPT Tradings + +1. [Trade with ChatGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/tree/master/demo/chatgpt-trading-v1) + * Using ChatGPT to give us trading suggestions. + * On [Ashare (News)](https://github.com/JinanZou/Astock) and A share Market ( `Maotai (贵州茅台 600519)` ) + ![image-20230220011335859](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202302200113884.png) +2. [Trade like ChatGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/tree/master/demo/chatgpt-trading-v2) + * Using ChatGPT's language model, GPT-3, to create a FinRL agent that trades as smartly as ChatGPT + * On [stocknet-dataset (Tweets)](https://github.com/yumoxu/stocknet-dataset) and US Stocks Market (`AAPL`) + ![image-20230216004801458](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202302181558796.png) +### Ⅱ. Sentiment Classify + +1. 
[Shares News Sentiment Classify.](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech/blob/master/demo/shares_news_sentiment_classify.py) \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/docs/index.md b/FinNLP/docs/FinNLP/docs/index.md new file mode 100644 index 0000000000000000000000000000000000000000..c7303fbbacc9445420b980eb8a732c59398d23bc --- /dev/null +++ b/FinNLP/docs/FinNLP/docs/index.md @@ -0,0 +1,128 @@ +# LLMs in the financial world and Internet-scale Financial Data + +The demos are shown in [FinGPT](https://github.com/AI4Finance-Foundation/FinGPT) and the data sources and supporting code is in [FinNLP](https://github.com/AI4Finance-Foundation/FinNLP) + +中文版请点击[这里](./zh/index.md) + +**Disclaimer: We are sharing codes for academic purposes under the MIT education license. Nothing herein is financial advice, and NOT a recommendation to trade real money. Please use common sense and always first consult a professional before trading or investing.** + +## Ⅰ. Architecture + +![image-20230505200244043](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052002139.png) + +* The whole project is made up of 4 parts: + + * The first part is the **Data Source**. Here, we **gather past and streaming data** from the Internet. + + * Next, we push the data to the **Data Engineering** part where we **clean the data, tokenize the data and do the prompt engineering**. + + * Then, the data is pushed to **LLMs**. Here, we may use LLMs in different kinds of ways. We can not only use the collected data to train our own **light-weight fine-tuning models** but we can also use that data and **trained models** or **LLM APIs** to support our applications. + * The last part would be the **application** part. Here, we can use data and LLMs to make many interesting applications. + +## Ⅱ. 
Data Sources + +![image-20230505200446477](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052004539.png) + +* Due to space limitations, we only show a few of them. + +### 1. [News](jupyter/Data_Sources_News.ipynb) + +| Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support | +| :----------------------------------------------------------: | :--------: | :------------: | :----------------------------------------------------------: | :---------------: | :--------: | :-------------------: | ------------------------------------------------------------ | ------------------------------------------------------------ | +| Yahoo | Financial News | US Stocks | √ | Date Range | Official | N/A | 1,500+ | √ | +| Reuters | Financial News | US Stocks | × | Date Range | Official | N/A | 1,500+ | √ | +| Sina | Financial News | CN Stocks | × | Date Range | Official | N/A | 2,000+ | √ | +| Eastmoney | Financial News | CN Stocks | √ | Date Range | Official | N/A | 1,000+ | √ | +| Yicai | Financial News | CN Stocks | √ | Date Range | Official | N/A | 500+ | Soon | +| CCTV | Government News | CN Stocks | × | Date Range | Third party | N/A | 4 | √ | +| US Mainstream | Financial News | US Stocks | √ | Date Range | Third party | Account (Free) | 3,200+ | √ | +| CN Mainstream | Financial News | CN Stocks | × | Date Range | Third party | ¥500/year | 3,000+ | √ | + +* Although FinGPT may have **fewer docs** than Bloomberg, we're on the **same order of magnitude.** + +### 2.
[Social Media](jupyter/Data_Sources_Social_Media.ipynb) + +| Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support | +| :---------------------: | :-------: | :------------: | :---------------: | :--------: | :---------: | :-----: | ---------- | :-----: | +| Twitter | Tweets | US Stocks | √ | Date Range | Official | N/A | 18,000+ | √ | +| StockTwits | Tweets | US Stocks | √ | Latest | Official | N/A | 160,000+ | √ | +| Reddit (wallstreetbets) | Threads | US Stocks | × | Latest | Official | N/A | 9+ | √ | +| Weibo | Tweets | CN Stocks | √ | Date Range | Official | Cookies | 1,400,000+ | √ | +| Weibo | Tweets | CN Stocks | √ | Latest | Official | N/A | 1,400,000+ | √ | + +* In **BloombergGPT**, they **don’t collect social media data**, but we believe that **public opinion is one of the most important factors influencing the stock market.** + +### 3. [Company Announcement](jupyter/Data_Sources_Company_Announcement.ipynb) + +| Platform | Data Type | Related Market | Specified Company | Range Type | Source Type | Limits | Docs (1e4) | Support | +| :-----------------------: | :-------: | :------------: | :---------------: | :--------: | :---------: | :----: | ---------- | :-----: | +| Juchao (Official Website) | Text | CN Stocks | √ | Date Range | Official | N/A | 2,790+ | √ | +| SEC (Official Website) | Text | US Stocks | √ | Date Range | Official | N/A | 1,440+ | √ | + +* Since we collect data from different stock markets, we have **more filing docs** than BloombergGPT. + +### 4.
Trends + +| Platform | Data Type | Related Market | Data Source | Specified Company | Range Type | Source Type | Limits | +| :-------------------------------------------------------: | :-------: | :------------: | :-----------------------------------------------------: | :---------------: | :--------: | :---------: | :----: | +| [Google Trends](https://trends.google.com/trends/explore) | Index | US Stocks | [Google Trends](./finnlp/data_sources/trends/google.py) | √ | Date Range | Official | N/A | +| [Baidu Index](https://index.baidu.com/v2/index.html#/) | Index | CN Stocks | Soon | - | - | - | - | + + +### 5. Data Sets + | Data Source | Type | Stocks | Dates | Available | + | :--------------: | :----: | :----: | :-------: | :--------------: | + | [AShare](https://github.com/JinanZou/Astock) | News | 3680 | 2018-07-01 to 2021-11-30 | √ | + | [stocknet-dataset](https://github.com/yumoxu/stocknet-dataset) | Tweets | 87 | 2014-01-02 to 2015-12-30 | √ | + | [CHRNN](https://github.com/wuhuizhe/CHRNN) | Tweets | 38 | 2017-01-03 to 2017-12-28 | √ | + +## Ⅲ. Models + +![image-20230505200618504](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052006541.png) + +* In data-centric NLP, we don’t train the model from the beginning. We only **call APIs** and **do light-weight fine-tuning.** +* The left part shows some LLM APIs that we may use, the middle part shows the models that we may fine-tune, and the right part shows some of the **fine-tuning methods**. + +### 1. Fine-tuning: Tensor Layers (LoRA) + +![image-20230505200944411](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052009480.png) + +* In FinGPT, we fine-tune a pre-trained LLM using a new financial dataset. **High-quality labeled data** is one of the most **important keys** to many successful LLMs, including ChatGPT. +* However, such high-quality labeled data is often very **expensive and time-consuming** to obtain, and we may need help from professional finance experts.
+* If our goal is to use LLMs to analyze financial-related text data and help with quantitative trading, why not **let the market do the labeling** for us? +* So here, we use the percentage change in the related stock price for each news item as the output label. We use thresholds to split the labels into three groups, **positive, negative, and neutral,** and use them as the **label of the news sentiment**. +* In correspondence, we also ask the model to select one of positive, negative, and neutral as the output in the **prompt engineering** part, so that we make the best use of the pre-trained information. +* By using LoRA, we can reduce the trainable parameters **from 6.17B to 3.67M**. +* As the table presents, compared with ChatGLM, FinGPT can achieve large improvements on multiple metrics. It may be **inappropriate** to **use our model for quantitative trading directly.** Since most **news titles are neutral**, most of the **original outputs of the LLMs are Neutral**, so LLMs **perform poorly on positive and negative labels**, and **those** **labels** are what might be **useful in quantitative trading.** +* However, **after fine-tuning**, we have witnessed **huge improvements in the prediction of** **positive and negative labels.** +* That’s also **why the model can achieve positive trading results**. + +### 2. Fine-tuning: Reinforcement Learning on Stock Prices (RLSP) + +![image-20230505201209946](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052012996.png) + +* In the same way, we may use RL on Stock Prices (RLSP) to replace the RL on Human Feedback used by ChatGPT. + +## Ⅳ. Applications + +### 1. Robo Advisor + +![image-20230505201913233](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052019296.png) + +* **ChatGPT can give investment advice just like a pro**. +* In this example, the **rising stock price** of Apple is **in accordance with** ChatGPT’s **prediction made by the analysis of news**. + +### 2.
Quantitative Trading + +![image-20230505201841001](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052018035.png) + +* We may also use News, Social media tweet or filing to **build sentiment factors**, the right part is the trading results just by the signal of the twitter tweets and ChatGPT, the data is from a data set called [stocknet-dataset](https://link.zhihu.com/?target=https%3A//github.com/yumoxu/stocknet-dataset). +* As you may see from the picture, the trading signals generated by ChatGPT are **so good** that we may **even achieve good results just by trading according to twitter sentiment factors.** +* So we may even **achieve better results by combining price factors**. + +### 3. Low-code development + +![image-20230505202028292](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052020363.png) + +* We can use the help of LLMs to write codes. +* The right part shows how we can develop our factors and other codes **quickly and efficiently.** \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_Company_Announcement.ipynb b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_Company_Announcement.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..505f2c3b2048c317a0410fd7eda69eb8f42a6606 --- /dev/null +++ b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_Company_Announcement.ipynb @@ -0,0 +1,783 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../FinNLP\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SEC" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.company_announcement.sec import SEC_Announcement" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = 
\"2020-01-01\"\n", + "end_date = \"2020-06-01\"\n", + "stock = \"AAPL\"\n", + "config = {\n", + " \"use_proxy\": \"us_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 3,\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Checking ips: 100%|██████████| 45/45 [01:42<00:00, 2.28s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get proxy ips: 45.\n", + "Usable proxy ips: 44.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading by item...: 100%|██████████| 39/39 [01:39<00:00, 2.54s/it]\n" + ] + } + ], + "source": [ + "downloader = SEC_Announcement(config)\n", + "downloader.download_date_range_stock(start_date, end_date, stock = stock)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idciksperiod_endingroot_formfile_numdisplay_namesxslsequencefile_datebiz_statessicsformadshfilm_numbiz_locationsfile_typefile_descriptioninc_statesitecontent
00000320193-20-000056:wf-form4_158932261319105.xml[0001631982, 0000320193]2020-05-084[][KONDO CHRIS (CIK 0001631982), Apple Inc. (A...xslF345X0312020-05-12[][3571]40000320193-20-000056[][, ]4FORM 4[, CA, ][]SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
10000320193-20-000054:wf-form4_158829658358801.xml[0001051401, 0000320193]2020-04-284[001-36743][JUNG ANDREA (CIK 0001051401), Apple Inc. (A...xslF345X0312020-04-30[CA][3571]40000320193-20-000054[20838087][, Cupertino, CA]4FORM 4[, CA][]SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
\n", + "
" + ], + "text/plain": [ + " _id \\\n", + "0 0000320193-20-000056:wf-form4_158932261319105.xml \n", + "1 0000320193-20-000054:wf-form4_158829658358801.xml \n", + "\n", + " ciks period_ending root_form file_num \\\n", + "0 [0001631982, 0000320193] 2020-05-08 4 [] \n", + "1 [0001051401, 0000320193] 2020-04-28 4 [001-36743] \n", + "\n", + " display_names xsl sequence \\\n", + "0 [KONDO CHRIS (CIK 0001631982), Apple Inc. (A... xslF345X03 1 \n", + "1 [JUNG ANDREA (CIK 0001051401), Apple Inc. (A... xslF345X03 1 \n", + "\n", + " file_date biz_states sics form adsh film_num \\\n", + "0 2020-05-12 [] [3571] 4 0000320193-20-000056 [] \n", + "1 2020-04-30 [CA] [3571] 4 0000320193-20-000054 [20838087] \n", + "\n", + " biz_locations file_type file_description inc_states ite \\\n", + "0 [, ] 4 FORM 4 [, CA, ] [] \n", + "1 [, Cupertino, CA] 4 FORM 4 [, CA] [] \n", + "\n", + " content \n", + "0 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "1 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "# df = df.drop_duplicates()\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21, 20)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
file_datedisplay_namescontent
02020-05-12[KONDO CHRIS (CIK 0001631982), Apple Inc. (A...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
12020-04-30[JUNG ANDREA (CIK 0001051401), Apple Inc. (A...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
22020-04-17[O'BRIEN DEIRDRE (CIK 0001767094), Apple Inc....SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
32020-04-17[KONDO CHRIS (CIK 0001631982), Apple Inc. (A...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
42020-04-09[Maestri Luca (CIK 0001513362), Apple Inc. (...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
52020-04-03[WILLIAMS JEFFREY E (CIK 0001496686), Apple I...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
62020-04-03[Maestri Luca (CIK 0001513362), Apple Inc. (...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
72020-02-28[WAGNER SUSAN (CIK 0001059235), Apple Inc. (...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
82020-02-28[LEVINSON ARTHUR D (CIK 0001214128), Apple In...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
92020-02-28[JUNG ANDREA (CIK 0001051401), Apple Inc. (A...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
\n", + "
" + ], + "text/plain": [ + " file_date display_names \\\n", + "0 2020-05-12 [KONDO CHRIS (CIK 0001631982), Apple Inc. (A... \n", + "1 2020-04-30 [JUNG ANDREA (CIK 0001051401), Apple Inc. (A... \n", + "2 2020-04-17 [O'BRIEN DEIRDRE (CIK 0001767094), Apple Inc.... \n", + "3 2020-04-17 [KONDO CHRIS (CIK 0001631982), Apple Inc. (A... \n", + "4 2020-04-09 [Maestri Luca (CIK 0001513362), Apple Inc. (... \n", + "5 2020-04-03 [WILLIAMS JEFFREY E (CIK 0001496686), Apple I... \n", + "6 2020-04-03 [Maestri Luca (CIK 0001513362), Apple Inc. (... \n", + "7 2020-02-28 [WAGNER SUSAN (CIK 0001059235), Apple Inc. (... \n", + "8 2020-02-28 [LEVINSON ARTHUR D (CIK 0001214128), Apple In... \n", + "9 2020-02-28 [JUNG ANDREA (CIK 0001051401), Apple Inc. (A... \n", + "\n", + " content \n", + "0 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "1 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "2 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "3 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "4 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "5 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "6 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "7 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "8 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "9 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... 
" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"file_date\", \"display_names\", \"content\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Juchao" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.company_announcement.juchao import Juchao_Announcement" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2020-01-01\"\n", + "end_date = \"2020-06-01\"\n", + "stock = \"000001\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 3,\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 3/3 [00:05<00:00, 1.86s/it]\n", + "Checking ips: 100%|██████████| 45/45 [00:48<00:00, 1.09s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "获取到的代理ip数量: 45 。Get proxy ips: 45.\n", + "能用的代理数量: 6。Usable proxy ips: 6.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1bb13261e75147929b30222347ab9cc5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading by page...: 0%| | 0/2 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsecCodesecNameorgIdannouncementIdannouncementTitleannouncementTimeadjunctUrladjunctSizeadjunctType...importantbatchNumannouncementContentorgNametileSecNameshortTitleannouncementTypeNamesecNameListPDF_pathContent
0None000001平安银行gssz00000011207862647关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告2020-05-27finalpage/2020-05-27/1207862647.PDF148PDF...NoneNoneNone平安银行关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告NoneNoneremoved证券代码: 000001 证券简称:平安银行 ...
1None000001平安银行gssz000000112078436882019年年度权益分派实施公告2020-05-22finalpage/2020-05-22/1207843688.PDF214PDF...NoneNoneNone平安银行2019年年度权益分派实施公告NoneNoneremoved1 证券代码: 000001 证券简称:平安银行 ...
\n", + "

2 rows × 25 columns

\n", + "" + ], + "text/plain": [ + " id secCode secName orgId announcementId \\\n", + "0 None 000001 平安银行 gssz0000001 1207862647 \n", + "1 None 000001 平安银行 gssz0000001 1207843688 \n", + "\n", + " announcementTitle announcementTime \\\n", + "0 关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告 2020-05-27 \n", + "1 2019年年度权益分派实施公告 2020-05-22 \n", + "\n", + " adjunctUrl adjunctSize adjunctType ... \\\n", + "0 finalpage/2020-05-27/1207862647.PDF 148 PDF ... \n", + "1 finalpage/2020-05-22/1207843688.PDF 214 PDF ... \n", + "\n", + " important batchNum announcementContent orgName tileSecName \\\n", + "0 None None None 平安银行 \n", + "1 None None None 平安银行 \n", + "\n", + " shortTitle announcementTypeName secNameList PDF_path \\\n", + "0 关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告 None None removed \n", + "1 2019年年度权益分派实施公告 None None removed \n", + "\n", + " Content \n", + "0 证券代码: 000001 证券简称:平安银行 ... \n", + "1 1 证券代码: 000001 证券简称:平安银行 ... \n", + "\n", + "[2 rows x 25 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(42, 25)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
announcementTimeshortTitleContent
02020-05-27关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告证券代码: 000001 证券简称:平安银行 ...
12020-05-222019年年度权益分派实施公告1 证券代码: 000001 证券简称:平安银行 ...
22020-05-20关于获准发行小微企业贷款专项金融债券的公告证券代码: 000001 证券简称:平安银行 ...
32020-05-16监事会决议公告1 证券代码: 000001 证券简称: 平安银行 ...
42020-05-152019年年度股东大会决议公告1 证券代码: 000001 证券简称:平安银行 ...
52020-05-152019年年度股东大会的法律意见书北京总部 电话 : (86 -10) 8519 -1300 传真 : (86 -10...
62020-04-30中信证券股份有限公司、平安证券股份有限公司关于公司关联交易有关事项的核查意见1 中信证券股份有限公司 、平安证券股份有限 公司 关于平安银行股份有限公司 关联交易 有...
72020-04-30独立董事独立意见1 平安银行股份有限公司独立董事独立意见 根据《关于在上市公司建立独立董事制度的指导...
82020-04-30关联交易公告1 证券代码: 000001 证券简称:平安银行 ...
92020-04-212020年第一季度报告全文证券代码: 000001 证券简称:平安银行 ...
\n", + "
" + ], + "text/plain": [ + " announcementTime shortTitle \\\n", + "0 2020-05-27 关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告 \n", + "1 2020-05-22 2019年年度权益分派实施公告 \n", + "2 2020-05-20 关于获准发行小微企业贷款专项金融债券的公告 \n", + "3 2020-05-16 监事会决议公告 \n", + "4 2020-05-15 2019年年度股东大会决议公告 \n", + "5 2020-05-15 2019年年度股东大会的法律意见书 \n", + "6 2020-04-30 中信证券股份有限公司、平安证券股份有限公司关于公司关联交易有关事项的核查意见 \n", + "7 2020-04-30 独立董事独立意见 \n", + "8 2020-04-30 关联交易公告 \n", + "9 2020-04-21 2020年第一季度报告全文 \n", + "\n", + " Content \n", + "0 证券代码: 000001 证券简称:平安银行 ... \n", + "1 1 证券代码: 000001 证券简称:平安银行 ... \n", + "2 证券代码: 000001 证券简称:平安银行 ... \n", + "3 1 证券代码: 000001 证券简称: 平安银行 ... \n", + "4 1 证券代码: 000001 证券简称:平安银行 ... \n", + "5 北京总部 电话 : (86 -10) 8519 -1300 传真 : (86 -10... \n", + "6 1 中信证券股份有限公司 、平安证券股份有限 公司 关于平安银行股份有限公司 关联交易 有... \n", + "7 1 平安银行股份有限公司独立董事独立意见 根据《关于在上市公司建立独立董事制度的指导... \n", + "8 1 证券代码: 000001 证券简称:平安银行 ... \n", + "9 证券代码: 000001 证券简称:平安银行 ... " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"announcementTime\", \"shortTitle\",\"Content\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finrl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "afd6dc03c9be451573fc2885de79a969af6a24a159f11a3ead741ab7a9ff405f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_EarningCalls.ipynb b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_EarningCalls.ipynb new file mode 100644 index 
0000000000000000000000000000000000000000..229b8de3e03e6b300c4d202bb85701f4dcc11001 --- /dev/null +++ b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_EarningCalls.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.chdir('../../../..')\n", + "# print(os.getcwd())" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.earning_calls import EarningCallTranscripts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Earning call transcripts takes in three arguments\n", + "\n", + "* Year\n", + "* Ticker symbol\n", + "* Quarter name from the list [\"Q1\",\"Q2\",\"Q3\",\"Q4\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "loader = EarningCallTranscripts(2023,'AAPL','Q3')\n", + "docs = loader.load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'text': \"Operator: Good day, and welcome to the Apple Q3 Fiscal Year 2023 Earnings Conference Call. Today's call is being recorded. At this time, for opening remarks and introductions, I would like to turn the call over to Saori Casey, Vice President of Finance. Please go ahead.\\nSaori Casey: Thank you. Good afternoon, and thank you for joining us. Speaking first today is Apple's CEO, Tim Cook; and he'll be followed by CFO, Luca Maestri. After that, we'll open the call to questions from analysts. 
Please note that some of the information you'll hear during our discussion today will consist of forward-looking statements, including, without limitation, those regarding revenue, gross margin, operating expenses, other income and expense, taxes, capital allocation and future business outlook, including the potential impact of macroeconomic conditions on the company's business and the results of operations. These statements involve risks and uncertainties that may cause actual results or trends to differ materially from our forecast. For more information, please refer to the risk factors discussed in Apple's most recently filed annual report on Form 10-K and the Form 8-K filed with the SEC today, along with the associated press release. Apple assumes no obligation to update any forward-looking statements, which speak only as of the date they are made. I'd now like to turn the call over to Tim for introductory remarks.\\nTim Cook: Thank you, Saori. Good afternoon, everyone, and thanks for joining us. Today, Apple is reporting revenue of $81.8 billion for the June quarter, better than our expectations. We continued to see strong results in emerging markets, driven by robust sales of iPhone with June quarter total revenue records in India, Indonesia, Mexico, the Philippines, Poland, Saudi Arabia, Turkey and the UAE. We set June quarter records in a number of other countries as well, including France, the Netherlands and Austria. And we set an all-time revenue record in Services driven by more than $1 billion paid subscriptions. We continued to face an uneven macroeconomic environment, including nearly 4 percentage points of foreign exchange headwinds. On a constant currency basis, we grew compared to the prior year's quarter in aggregate and in the majority of markets we track. We continue to manage deliberately and innovate relentlessly, and we are driven by the sense of possibility those efforts inspire. 
To that end, before I turn to the quarter in more detail, I want to take a moment to acknowledge the unprecedented innovations we were proud to announce at our Worldwide Developers Conference. In addition to extraordinary new Macs and incredible updates to our software platforms, we had the chance to introduce the world to spatial computing. We were so pleased to share the revolutionary Apple Vision Pro with the world, a bold new product unlike anything else created before. Apple Vision Pro is a marvel of engineering, built on decades of innovation only possible at Apple. It is the most advanced personal electronic device ever created, and we've been thrilled by the reaction from press, analysts, developers and content creators who've had the chance to try it. We can't wait to get it into customers' hands early next year. Now let me share more with you on our June quarter results beginning with iPhone. iPhone revenue came in at $39.7 billion for the quarter, down 2% from the year ago quarter's record performance. On a constant currency basis, iPhone revenue grew, and we had a June quarter record for switchers, reflecting the popularity of the iPhone lineup. iPhone 14 customers continue to praise the exceptional battery life and essential health and safety features, while iPhone 14 Plus users are loving the new larger screen size. And with Dynamic Island, Always-On display and the most powerful camera system ever in an iPhone, the iPhone 14 Pro lineup is our best ever. Turning to Mac. We recorded $6.8 billion in revenue, down 7% year-over-year. We are proud to have completed the transition of our entire Mac lineup to run exclusively on Apple silicon. We are also excited to have introduced the new 15-inch MacBook Air during the quarter, the world's best 15-inch laptop and one of the best Macs we've ever made. And we launched 2 new powerhouses in computing, Mac Studio with M2 Max and M2 Ultra and Mac Pro with M2 Ultra, which are the most powerful Macs we've ever made. 
iPad revenue was $5.8 billion for the June quarter, down 20% year-over-year, in part due to a difficult compare because of the timing of the iPad Air launch last year. Customers are loving iPad's versatility and exceptional value. There was a great deal of excitement from creatives when we brought Final Cut Pro and Logic Pro to iPad this spring. And with the back-to-school season in full swing, iPad has the power to help students tackle the toughest assignments. Across Wearables, Home and Accessories, revenue was $8.3 billion, up 2% year-over-year and in line with our expectations. Packed with features to empower users to live a healthier life, Apple Watch and Apple Watch Ultra continue to help people take the next step on their wellness journey. As I mentioned earlier, last quarter, we held our biggest and most exciting WWDC yet. We were thrilled to welcome developers from across the globe to Apple Park, both in person and virtually, and to share some stunning new announcements with the world. In addition to Apple Vision Pro and the new Macs that we introduced, we had the chance to reveal some truly remarkable new innovations to our software platforms. From exciting new features like Live Voicemail and StandBy in iOS 17, to new tools for users to work, play and personalize their experience in macOS Sonoma and iPadOS 17, to a fresh design and new workout capabilities in watchOS 10, there's so much coming later this year to empower users to get more out of their devices, and we think they're going to instantly love these new features. It was also an exciting quarter for Services where revenue reached $21.2 billion and saw a sequential acceleration to an 8% year-over-year increase, better than we expected. We set an all-time revenue record for total services and in a number of categories, including video, AppleCare, cloud and payment services. 
Since we introduced Apple Pay almost a decade ago, customers have been loving how easy it is to make purchases online, in apps and in stores. We're also pleased to see Apple Card build on the success of Apple Pay. Designed with our users' financial health in mind, Apple Card has become one of the most successful credit card programs in the U.S. with award-winning customer satisfaction. And this spring, we introduced a new high-yield savings account for Apple Card customers, which has become incredibly popular, with customers already making more than $10 billion in deposits. Meanwhile, Apple TV+ continues to provide a spectacular showcase of imaginative storytelling. Recently, fans welcomed new series like Hijack and Silo as well as returning fan favorites like Foundation and The Afterparty. In the few years since its launch, Apple TV+ has earned more than 1,500 nominations and 370 wins. That includes the 54 Emmy Award nominations across 13 titles that Apple TV+ received last month. It's also been an exciting time for sports on Apple TV+. Soccer legend Lionel Messi made his debut with Major League Soccer last month, and fans all over the world tuned in with MLS Season Pass. We are excited about our MLS partnership, and we're thrilled to see Messi suiting up with Inter Miami. And just in time for summer concert season, Apple Music launched new discovery features celebrating live music, including venue guides in Apple Maps and set lists from tours of major artists. These new features and others join a lineup of updates coming later this year to make Services more powerful, more useful and more fun than ever. Everything we do is in service of our customers, and retail is where we bring the best of Apple. During the quarter, we opened the Apple Store online in Vietnam, and we're excited to connect with more customers there. 
We also redesigned our first-ever Apple Store located in Tysons Corner, Northern Virginia, with inclusive, innovative and sustainable design enhancements. We opened a beautiful new store beneath our new London headquarters in the historic Battersea Power Station. And the performance of the stores we opened in India this spring exceeded our initial expectations. With every product we create, every feature we develop and every interaction we share with our customers, we lead with the values we stand for. We believe in creating technology that serves all of humanity, which is why accessibility has always been a core value that we embed in everything we do. On Global Accessibility Awareness Day, we unveiled some extraordinary new tools for cognitive, vision, hearing and mobile accessibility that will be available later this year, including Assistive Access, which distills apps to their most essential features, and Personal Voice, which allows users to create a synthesized voice that sounds just like them. Building technology in service of our customers also means protecting their privacy, which we believe is a fundamental human right. That's why we were pleased to announce major updates to Safari Private Browsing, Communication Safety and Lockdown Mode to further safeguard our users. And as part of our efforts to build a better world, we announced that we've more than doubled our initial commitment to our Racial Equity and Justice Initiative to more than $200 million. We will continue to do our part to support education, economic empowerment and criminal justice reform work. And while supporting efforts to advance equity and opportunity, we continue to build a culture of belonging at Apple and a workforce that reflects the communities we serve. Through our environmental work, we're making strides in our commitment to leave the world better than we found it. 
Last month, Apple joined with global nonprofit Acumen in a new effort to improve livelihoods in India through clean energy innovation, and we are as committed as ever to our Apple 2030 goal to be carbon neutral across our entire supply chain and the life cycle of our products. We've long held that education is the great equalizer. With that in mind, we're expanding Apple Learning Coach, a free professional learning program that teaches educators how to get more out of Apple technology in the classroom. Today, we welcome more than 1,900 educators across the U.S. to the program. By the end of the year, we'll offer Apple Learning Coach in 12 more countries. As we're connecting with teachers, we're also celebrating the graduations of students at our app developer academies around the world. From Detroit, to Naples, to Riyadh and more, we're excited to watch these talented developers embark on careers in coding and find ways to make a positive difference in their communities. Apple remains a champion of innovation, a company fueled by boundless creativity, driven by a deep sense of mission and guided by the unshakable belief that a great idea can change the world. Looking ahead, we'll continue to manage for the long term, always pushing the limits of what's possible and always putting the customer at the center of everything we do. With that, I'll turn it over to Luca.\\nLuca Maestri: Thank you, Tim, and good afternoon, everyone. Revenue for the June quarter was $81.8 billion, down 1% from last year and better than our expectations despite nearly 4 percentage points of negative impact from foreign exchange. On a constant currency basis, our revenue grew year-over-year in total and in the majority of the markets we track. We set June quarter records in both Europe and Greater China and continue to see strong performance across our emerging markets driven by iPhone. 
Products revenue was $60.6 billion, down 4% from last year, as we faced FX headwinds and an uneven macroeconomic environment. However, our installed base reached an all-time high across all geographic segments, driven by a June quarter record for iPhone switchers and high new-to rates in Mac, iPad and Watch, coupled with very high levels of customer satisfaction and loyalty. Our Services revenue set an all-time record of $21.2 billion, up 8% year-over-year and grew double digits in constant currency. Our performance was strong around the world as we reached all-time Services revenue records in Americas and Europe and June quarter records in Greater China and rest of Asia Pacific. Company gross margin was 44.5%, a record level for the June quarter and up 20 basis points sequentially, driven by cost savings and favorable mix shift towards Services, partially offset by a seasonal loss of leverage. Products gross margin was 35.4%, down 130 basis points from last quarter due to seasonal loss of leverage and mix, partially offset by favorable costs. Services gross margin was 70.5%, decreasing 50 basis points sequentially. Operating expenses of $13.4 billion were below the low end of the guidance range we provided at the beginning of the quarter and decelerated from the March quarter. We continue to take a deliberate approach in managing our spend with strong focus on innovation and new product development. The results of these actions delivered net income of $19.9 billion, diluted earnings per share of $1.26, up 5% versus last year, and very strong operating cash flow of $26.4 billion. Let me now provide more detail for each of our revenue categories. iPhone revenue was $39.7 billion, down 2% year-over-year but grew on a constant currency basis. 
We set revenue records in several markets around the world, including an all-time record in India and June quarter records in Latin America, the Middle East and Africa, Indonesia, the Philippines, Italy, the Netherlands and the U.K. Our iPhone active installed base grew to a new all-time high, thanks to a June quarter record in switchers. This is a testament to our extremely high levels of customer satisfaction, which 451 Research recently measured at 98% for the iPhone 14 family in the U.S. Mac generated $6.8 billion in revenue, down 7% year-over-year. We continue to invest in our Mac portfolio. And this past quarter, we were pleased to complete the transition to Apple silicon for the entire lineup. This transition has driven both strong upgrade activity and a high number of new customers. In fact, almost half of Mac buyers during the quarter were new to the product. We also saw reported customer satisfaction of 96% for Mac in the U.S. iPad revenue was $5.8 billion, down 20% year-over-year and in line with our expectations. These results were driven by a difficult compare against the full quarter impact of the iPad Air launch in the prior year. At the same time, we continue to attract a large number of new customers to the iPad installed base with over half of the customers who purchased iPads during the quarter being new to the product. And the latest reports from 451 Research indicate customer satisfaction of 96% in the U.S. Wearables, Home and Accessories revenue was $8.3 billion, up 2% year-over-year, with a June quarter record in Greater China and strong performance in several emerging markets. We continue to see Apple Watch expand its reach with about 2/3 of customers purchasing an Apple Watch during the quarter being new to the product. And this is combined with very high levels of customer satisfaction, which was recently reported at 98% in the United States. Moving on to Services. 
We reached a new all-time revenue record of $21.2 billion with year-over-year growth accelerating sequentially to 8% and up double digits in constant currency. In addition to the all-time records Tim mentioned earlier, we also set June quarter records for advertising, App Store and Music. We are very pleased with our performance in Services, which is a direct reflection of our ecosystem's strength. First, our installed base of over 2 billion active devices continues to grow at a nice pace and establishes a solid foundation for the future expansion of our ecosystem. Second, we see increased customer engagement with our services. Both our transacting accounts and paid accounts grew double digits year-over-year, each reaching a new all-time high. Third, our paid subscriptions showed strong growth. This past quarter, we reached an important milestone and passed 1 billion paid subscriptions across the services on our platform, up 150 million during the last 12 months and nearly double the number of paid subscriptions we had only 3 years ago. And finally, we continue to improve the breadth and the quality of our current services. From 20 new games on Apple Arcade, to brand-new content on Apple TV+, to the launch of our high-yield savings account with Apple Card, our customers are loving these enhanced offerings. Turning to the enterprise market. Our customers are leveraging Apple products every day to help improve productivity and attract talent. Blackstone, a global investment management firm, is expanding its Apple footprint from their corporate iPhone fleet to now offering the MacBook Air powered by M2 to all of their corporate employees and portfolio companies. Gilead, a leading biopharmaceutical company, has deployed thousands of iPads globally to their sales team. Over the last 6 months, they have also doubled their Mac user base by making MacBook Air available to more employees with a focus on user experience and strong security. 
Let me now turn to our cash position and capital return program. We ended the quarter with over $166 billion in cash and marketable securities. We repaid $7.5 billion in maturing debt while issuing $5.2 billion of new debt and increasing commercial paper by $2 billion, leaving us with total debt of $109 billion. As a result, net cash was $57 billion at the end of the quarter. During the quarter, we returned over $24 billion to shareholders, including $3.8 billion in dividends and equivalents and $18 billion through open market repurchases of 103 million Apple shares. We continue to believe there is great value in our stock and maintain our target of reaching a net cash neutral position over time. As we move ahead into the September quarter, I'd like to review our outlook, which includes the types of forward-looking information that Saori referred to at the beginning of the call. We expect our September quarter year-over-year revenue performance to be similar to the June quarter, assuming that the macroeconomic outlook doesn't worsen from what we are projecting today for the current quarter. Foreign exchange will continue to be a headwind, and we expect a negative year-over-year revenue impact of over 2 percentage points. We expect iPhone and Services year-over-year performance to accelerate from the June quarter. Also, we expect the revenue for both Mac and iPad to decline by double digits year-over-year due to difficult compares, particularly on the Mac. For both products, we experienced supply disruptions from factory shutdowns in the June quarter a year ago and were able to fulfill significant pent-up demand in the year ago September quarter. We expect gross margin to be between 44% and 45%. We expect OpEx to be between $13.5 billion and $13.7 billion. We expect OI&E to be around negative $250 million, excluding any potential impact from the mark-to-market of minority investments, and our tax rate to be around 16%. 
Finally, today, our Board of Directors has declared a cash dividend of $0.24 per share of common stock payable on August 17, 2023, to shareholders of record as of August 14, 2023. With that, let's open the call to questions.\\nSaori Casey: Thank you, Luca. [Operator Instructions]. Operator, may we have the first question, please?\\nOperator: [Operator Instructions]. We will go ahead and take our first question from Shannon Cross with Credit Suisse.\\nShannon Cross: Tim, you mentioned -- and actually, Luca, too, you mentioned an uneven macro environment during the quarter several times on the call. I'm wondering if you can talk on a geographic basis about some of the trends you're seeing in iPhone. I'm specifically wondering how demand is trending within...\\nLuca Maestri: Sure. Shannon, I'll answer it. I didn't get the end of your question.\\nOperator: I think she has dropped.\\nLuca Maestri: Okay. Well, let me answer the question for the part that I could follow. So on a geographic basis, we've had great performance for iPhone in emerging markets. We set June quarter records in many of the emerging markets. We grew in total double digits. And the performance was strong across the board in emerging markets from China, where our performance improved from minus 3% to plus 8% in the June quarter and we grew double digits in constant currency, to many other areas around the world from India, where, again, we set a June quarter record with very strong performance there, Indonesia, Southeast Asia, in general, Latin America, Middle East. And so it's been really good there. We -- also, as you can see from our geographic segments, we had a slight acceleration of performance in the Americas, primarily in the United States, but we declined there because the smartphone market has been in a decline for the last couple of quarters in the United States.\\nShannon Cross: Sorry about that. I'm not sure why I cut off. 
In terms of gross margin, you were at the high end of the range [Technical Difficulty] and you guided to 45% at the high end, which is, I think, higher than I remember in 20 years of covering you. So how should we think about puts and takes of gross margin? And it seems like there's like a perfect storm of good things. So I just -- maybe if you can talk about how you're thinking about it more holistically.\\nLuca Maestri: I think you remember correctly, Shannon, because the 44.5% for the June quarter is an all-time record for us in June. We were up 20 basis points sequentially. It was driven by cost savings and a mix shift towards Services, which obviously helps company gross margins, partially offset by the seasonal loss of leverage. We have a commodity environment that is favorable to us. Our product mix is quite strong at this point. And so with the exception of foreign exchange, which continues to be a drag, and it was a significant drag on a year-over-year basis, yes, we are in a good position right now. We are in a good position for the June quarter. And as I mentioned, we expect similar level of gross margins for the same reasons, frankly, for the September quarter.\\nOperator: Our next question comes from Wamsi Mohan of Bank of America.\\nWamsi Mohan: Luca, can you just give us a little more color around the guidance? Your overall revenue performance, you called out similar. Obviously, you absorbed a higher FX impact this quarter versus your guide. And you also noted Services acceleration. So just wondering, when you think about that comment on iPhone acceleration, is that on a reported basis? Is that constant currency basis? And is there something that's changing in terms of seasonality perhaps for you that is causing not as much step-up in product revenue as typical on a sequential basis? And I have a follow-up.\\nLuca Maestri: Yes. So all our comments are in reported currency, not in constant currency in relation to the outlook. 
And we said acceleration sequentially for iPhone and for Services. But we're also pointing out -- and this is where I think, Wamsi, you're referring to some seasonality issues. We also said that for Mac and iPad, we expect to decline double digits. And the reason for that is that we have a very difficult compare versus last year. You remember that a year ago, in the June quarter, we had factory shutdowns for both Mac and iPad. And so we were able to fill the pent-up demand from those shutdowns during the September quarter. So an unusual level of activity that we had a year ago. And so now, obviously, the compare is difficult. So we expect both iPad and Mac to be down double digits, which offset the acceleration that I mentioned for iPhone and Services.\\nWamsi Mohan: Okay. And Tim, I was wondering if you could update us on what percent of iPhones are sold on some type of installment basis now versus full upfront payment on a global basis. And maybe some thoughts on if you expect similar promotional activity from carriers, especially in the U.S., that seem to be grappling with a lot of cash flow issues this particular year.\\nLuca Maestri: Wamsi, I'll take it. We've done a really good job over the last few years with affordability programs around the world directly in our direct channel and with our partners around the world. The majority of iPhones, at this point, are sold using some kind of a program, trade-ins, installments, some kind of financing. And that percentage, which again, it's well over 50%, is very similar across developed and emerging markets. We want to do more of that because we think it really helps reduce the affordability threshold for our products. And we think it is also one of the reasons why our product mix has been very strong during the last couple of cycles. 
So we will continue to push on that front.\\nOperator: Our next question is from David Vogt with UBS.\\nDavid Vogt: I just wanted to follow up on 2 points that both you, Tim, and Luca made about growth and maybe commodities. So just to be clear, I know you're talking about an acceleration in iPhone, but the comp is about 2 points easier from FX. So I just want to understand, is that on a like-for-like basis, excluding the currency improvement of about 2 points from the June quarter to the September quarter? And from a commodity perspective, I know last quarter, you talked about buying a lot of inventory at favorable prices, which was an incredibly smart strategy. Where do you sit today? And what's sort of the timing or the duration of that commodity sort of backlog that you have as we think about next quarter and the subsequent quarters? How far does that get you out into the future from this favorable cost dynamic?\\nLuca Maestri: Let me start again. I just want to be clear about the guidance, the outlook guidance that we provided. We're referring entirely to reported numbers. So they take into account the fact that we have a slight improvement in foreign exchange. So when I talk about similar performance, I refer to reported performance in the June quarter and then the reported performance in the September quarter. And again, we expect, on a reported basis, our iPhone performance to accelerate, our Services performance to accelerate, and iPad and Mac to decline double digits. On the commodity front, as I mentioned, the environment is favorable. We always make sure that we take advantage of the opportunities that are available in the market, and we will continue to do that going forward.\\nDavid Vogt: Luca, any sense of how long that gives you a run rate today based on what you currently have? 
Can you give us a sense for at least the short-term tailwind?\\nLuca Maestri: I don't want to speculate past the September quarter because that's the horizon where we provide guidance. And I've said that the guidance for September is 44% to 45%, which you know is historically very high. And so obviously, that reflects a favorable environment for us.\\nOperator: Our next question is from Erik Woodring with Morgan Stanley.\\nErik Woodring: I have 2 as well. Maybe if we just start kind of big picture, Tim or Luca. I was wondering if you could just kind of share some incremental color on how you think the consumer is behaving today versus 90 days ago and maybe how that differs by region. Meaning, are there any signs that consumer is incrementally more willing to spend on things like consumer electronics? Or is there still relative caution in the market? Are there any regions where you're seeing more strength in the consumer? And how sustainable do you think some of that strength or weakness could be based on some of the KPIs you track? And then I have a follow-up.\\nTim Cook: Yes. David, it's Tim. If you sort of step around the world, we did exceptionally well in emerging markets last quarter and even better on a constant currency basis. And so emerging markets were -- was a strength. If you look at China, in China, we went from a negative 3% in Q2 to a plus 8% in Q3. And so in China, we had an acceleration. If you look at the U.S., which is in the -- obviously in the Americas segment, it is the vast majority of what's in there, there was also a slight acceleration sequentially, although the Americas is still declining somewhat year-over-year, as you can see on the data sheet. The primary reason for that is that it's a challenging smartphone market in the U.S. currently. And then in Europe, Europe saw a record quarter and -- for the June quarter, a record. And so some really good signs in most places in the world.\\nErik Woodring: Awesome. 
And then maybe, Luca, a question for you. I think it's been about 3 quarters now where we've seen OpEx either grow below historical seasonality or come in below your expectations. I think this is the first time we've seen R&D grow less than 10% year-over-year since fiscal 2Q 2007. So can you maybe just talk about some of the cost actions you're taking? And as you look forward, what are the indicators that you're really evaluating that would give you greater confidence in perhaps returning back to a more seasonal cadence of OpEx spending? Or is this just a new normal that we should be expecting? That's it for me.\\nLuca Maestri: Obviously, we look at the environment, and we know that this has been an uncertain period for the last few quarters. And so we decided to be deliberate in what we do in terms of controlling our spend, and there's many areas across the company that we're working on and we've been quite effective at slowing down the spend. We slowed down also the hiring within the company in several areas. And we're very pleased with our ability to decelerate some of the expense growth taking into account the overall macro situation. We will continue to manage deliberately. You can see that we continue to grow our R&D costs faster than the rest of the company. SG&A is actually growing at a much slower pace because obviously, our focus continues to be in innovation and product development, and we'll continue to do that.\\nOperator: Our next question is from Michael Ng with Goldman Sachs.\\nMichael Ng: I just have 2 questions as well. First, it was encouraging to see the Services outperformance in the quarter, up double digits on an FX-neutral basis, and more Services acceleration next quarter on a reported basis. I was just wondering if you could just talk a little bit more about key underlying drivers for the confidence in the Services acceleration next quarter, understanding that FX a little bit. 
But anything to call out as it relates to things in Apple Search Ads that's helping. You're obviously making a lot of investments in Apple TV+ between MLS and the Canal+ deal. So any thoughts there would be great.\\nLuca Maestri: Yes, Michael, you're correct. I mean clearly, we've seen an improvement in the June quarter, and we expect further improvement in the September quarter. In June, the performance was across the board. Tim and I mentioned we set records really across the board. We had all-time records in cloud, in video, in AppleCare, in payments and June quarter records in App Store, advertising and Music. So we saw improvement in all our Services categories. We think the situation will continue to improve as we go through September. And that's very positive because not only good for the financial results, but obviously, it shows a high level of engagement of our customers in the ecosystem, which is very important for us. And it's really the sum of all the things that I mentioned in my prepared remarks. It goes from the fact that our installed base continues to grow, so we've got a larger pool of customers, to the fact that our customers are more engaged as we have more transacting accounts and paid accounts on the ecosystem. And the subscriptions business is very healthy with growth of 150 million paid subscriptions just in the last 12 months. It's almost double to what we had 3 years ago. And of course, we are providing more and more content to our users. And so the combination of all these things gives us good confidence for September.\\nMichael Ng: Great. And just as a related follow-up, it's about the hardware installed base and Services ARPU. I was curious when you talked about the Services strength, you talked about the 2 billion-plus installed base. When you think about the opportunity to increase the Services ARPU, do you really think about it internally on a per-active-iPhone user basis or on a per-device basis? 
Said differently, I'm just curious where you think about -- whether you think there's an incremental opportunity for those users that have multiple devices. Do you really see a big Services ARPU uplift in that respect?\\nLuca Maestri: Well, we know that customers that own more than one device are typically more engaged in our ecosystem. And so obviously, they tend to also spend more on the Services front. I would say the biggest opportunity is that we know that there's a lot of customers that we have that are very familiar with our ecosystem. They are engaged in the ecosystem. But still today, they're using only the portion of the ecosystem that is free. And so we think that by offering better content and more content over time, we're going to be able to attract more of them as paid customers.\\nOperator: Our next question is from Amit Daryanani with Evercore.\\nAmit Daryanani: I have 2 as well. I guess, Luca, maybe if you can talk about Wearables a bit. The growth over there, I think, in constant currency was fairly impressive at plus 6%. Can you just touch on maybe what's driving that? And then how do we think about the Wearables segment heading into the September quarter? I know you talked about a bunch of other ones, but how do we think about Wearables into September as well?\\nLuca Maestri: Sorry, Amit, I didn't get the -- what are you referring to?\\nAmit Daryanani: Yes. Sorry. I was hoping you could talk a bit about the Wearables segment because the growth over there was fairly impressive. And then how do you think about it into September as well?\\nLuca Maestri: Yes. On the Wearables front, we had really good performance in Greater China. And that's, again, very important for us. It was a June quarter record for Greater China. Very important for us because, again, it shows that the engagement with the ecosystem in a market that is so important for us like China continues to grow. 
It means that there's more and more customers that are owning more than the iPhone. Also, we continue to grow the installed base of the category very quickly because, as I mentioned, 2/3 of every buyer of Apple Watch during the course of the June quarter was new to the product. And so that is all additive to the installed base. So it's just great to see that the AirPods continue to be a great success in the marketplace for us. And so things are moving in the right direction there. It's become a very large business for us in Wearables, Home and Accessories. The last 12 months, we've done $40 billion of business, which is nearly the size of a Fortune 100 company. So it's become very important, and it's allowed us to diversify both our revenues and our earnings.\\nAmit Daryanani: That's really helpful. And then if I could just follow up, the Europe growth, the growth in Europe at up 5% is totally notable as well. I think you have a few emerging markets that you put in Europe as well. But I would love to understand what's happening in Europe and if there's a way to think about sort of Western Europe or developed world versus emerging markets over there.\\nLuca Maestri: Yes. It's been very good, primarily on the emerging market side of Europe. We include India and the Middle East and Central and Eastern Europe into the Europe segment. But as we mentioned at the beginning of the call, we had a number of markets that did very well, like France, like Italy, the Netherlands, Austria. So it was a good quarter for Europe.\\nOperator: Our next question is from Harsh Kumar with Piper Sandler.\\nHarsh Kumar: I have one for Luca and then later on one for Tim. So Luca, for some time now, for many quarters, you've had a currency headwind or foreign exchange currency headwind. It's conceivable that as rates start to come down, hopefully next year that the dollar weakens. 
Could you take us through the mechanism of how that will work on your revenues and for your costs?\\nLuca Maestri: So we tend -- we try to hedge our foreign exchange exposures because we think it's the right approach for the company in terms of minimizing the volatility that necessarily happens from the movements of currencies. We cannot effectively hedge every single exposure around the world because in some cases, it is not possible. In other cases, it is prohibitively expensive. But we tend to cover all the major currency pairs that we have. About 60% of our business is outside the United States. So it's a very, very large and, I would say, very effective hedging program. And so we set up these hedges, and they tend to roll over very regularly. And then we replace them with new hedges at the new spot rate. So the impact that we're going to have on revenue and cost will depend on where the spot rates are at different points in time. And therefore, because of the way the program works, there tends to be a bit of a lag in both directions as the foreign exchange moves over time.\\nHarsh Kumar: Understood. Very helpful. And for Tim, Tim, historically, for the last many years, carriers in at least the U.S., which I think is your largest market for iPhone, have had programs to help folks upgrade, whether they give a cash rebate or you bring in your old phone, something like that. I was curious, as you get into your peak December quarter, if you're aware of these programs are in place. And the reason why I'm asking is I think earlier, you mentioned that more than 50% of your phones are sold through some kind of program. I assume the number is even higher in the U.S.\\nTim Cook: I don't want to get into revealing specifics in the different carriers. But generally speaking, I would think that it would be quite easy to find a promotion on a phone, provided you're hooking up to a service and either switching services, carriers or upgrading your phone at the same carrier. 
I think both of those cases today that you can find promotions out there, and I would expect that you'd be able to do that in the December time frame as well.\\nOperator: Our next question is from Aaron Rakers with Wells Fargo.\\nAaron Rakers: I have two as well. So first of all, I just want to kind of ask Tim. Strategically, as we think about the Services growth and kind of the content expansion behind that, I'm curious if you could help us maybe appreciate what you've seen from a sporting perspective in terms of the engagement with MLS, the engagement with Major League Baseball, and how strategically you're thinking about expansion in sports as a key driver of Services growth going forward.\\nTim Cook: We're focused on original content, as you know, with TV+. And so we're all about giving great storytellers the venue to tell great stories and hopefully get us all to think a little deeper. And sport is a part of that because sport is the ultimate original story. And for MLS, we're -- we could not be happier with how the partnership is going. It's clearly in the early days, but we are beating our expectation in terms of subscribers, and the fact that Messi went to Inter Miami helped us out there a bit. And so we're very excited about it.\\nAaron Rakers: Yes. And as a quick follow-up, I'm just curious, an update on -- you mentioned in your prepared remarks the continued growth that you've seen in India. I'm curious how we think about that market opportunity looking forward. Is there anything that you see evolving that could accelerate the opportunity for iPhone in that large mobile market?\\nTim Cook: We did hit a June quarter revenue record in India, and we grew strong double digits. We also opened our first 2 retail stores during the quarter. And it's -- of course, it's early going currently, but they're currently beating our expectation in terms of how they're doing. 
We continue to work on building out the channel and putting more investment in our direct-to-consumer offers as well. And so I think if you look at it, it's the second largest smartphone market in the world. And it's -- so we ought to be doing really well there. And where I'm really pleased with our growth there, we're still -- we still have a very, very modest and low share in the smartphone market. And so I think that it's a huge opportunity for us. And we're putting the -- all of our energies in making that occur.\\nOperator: Our next question comes from Sidney Ho with Deutsche Bank.\\nSidney Ho: Your -- I just wanted to ask about the AI side of things. Your strategy on AI seems quite different than many of your peers, at least you don't talk too much about that, how much you invest in it. Maybe you can elaborate a little bit on that. But related to that, how do you see your investment in this area turning into financial performance in the future? Is it mainly through faster upgrade cycle, maybe higher ASP? Or are you thinking about maybe additional services that you can capitalize on that? And then I have a follow-up.\\nTim Cook: If you take a step back, we view AI and machine learning as core fundamental technologies that are integral to virtually every product that we build. And so if you think about WWDC in June, we announced some features that will be coming in iOS 17 this fall, like Personal Voice and Live Voicemail. Previously, we had announced lifesaving features like fall detection and crash detection and ECG. None of these features that I just mentioned and many, many more would be possible without AI and machine learning. And so it's absolutely critical to us. And of course, we've been doing research across a wide range of AI technologies, including generative AI for years. We're going to continue investing and innovating and responsibly advancing our products with these technologies with the goal of enriching people's lives. 
And so that's what it's all about for us. And as you know, we tend to announce things as they come to market, and that's our MO, and I'd like to stick to that.\\nSidney Ho: Okay. That's fair. Maybe as a follow-up is related to -- you talked about WWDC, where you actually introduced Vision Pro there. Clearly, a very big announcement there. How should we think about the revenue ramp related to the Vision Pro? Is there any catalysts that we should be thinking about that will drive an inflection of that product?\\nTim Cook: Yes. There's enormous excitement around the Vision Pro. We're excited internally. Everybody that's been through the demos are blown away, whether you're talking about press or analysts or developers. We are now shipping units to the developer community for them to begin working on their apps. And we're looking forward to shipping early next year. And so we could not be more excited with that. I'm using the product daily. And so we're not going to forecast revenues and so forth on the call today, but we're very excited about it.\\nOperator: We will take our last question from Krish Sankar with TD Cowen.\\nKrish Sankar: I have two of them as well. Number one, on iPhone, Tim, you mentioned about the record number of switchers in the quarter. I'm kind of curious how to think about, given the weak macro and consumer spending, how is the replacement cycle for iPhone? Is it similar, longer, shorter versus prior years? And can you talk a little bit about the demand linearity of iPhone during the June quarter? And then I have a follow-up.\\nTim Cook: Switchers were a very key part of our iPhone results for the quarter. We did set a record. We set a record in Greater China, in particular, and it was at the heart of our results there. And we continue to try to convince more and more people to switch because of our -- the experience and the ecosystem and -- that we can offer them. And so I think switching is a huge opportunity for us. 
In terms of the upgrade cycle and so forth, it's very difficult to estimate real time what is going on with the upgrade cycle. I would say, if you think about the iPhone results year-over-year, you have to think about the SE announcement in the year ago quarter, the iPhone SE announcement in the year ago quarter. And so that provides a bit of a headwind on the comp. But as Luca said, as he talked about how we're viewing Q4, the September quarter, we see iPhone accelerating in Q4.\\nKrish Sankar: Got it. Very helpful, Tim. And then my final question is on your retail stores, you obviously have a very large retail footprint and many of your stores seem to have been open for over a year now. How is the foot traffic there? And how do you think about sales or the retail trends in the June quarter and implications for the back half of this year on a seasonality basis?\\nTim Cook: I'm sorry, are you talking about our retail stores?\\nKrish Sankar: Yes, yes, your retail stores.\\nTim Cook: Yes. The -- if you look at retail, it's a key part of our go-to-market approach, and it will be so key and such a competitive advantage with Vision Pro. It will give us the opportunity to launch a new product and demo to many people in the stores. And so it has many advantages in it. And we continue to roll out more stores. As you know, we just opened 2 in India last quarter. We're -- there's still a lot of countries out there that don't have Apple stores that we would like to go into. And so we continue to see it as a key part of how we go to market and love the experience that we can provide customers there.\\nSaori Casey: A replay of today's call will be available for two weeks on Apple Podcasts, at a webcast of apple.com/investor and via telephone. The number for the telephone replay is 866-583-1035. Please enter the confirmation code 2553017, followed by the pound sign. These replays will be available by approximately 5 p.m. Pacific Time today. 
Members of the press with additional questions can contact Josh Rosenstock at 408-862-1142. Financial analysts can contact me, Saori Casey, with additional questions at 408-974-3123 while Suhasini Chandramouli is on her maternity leave. Thank you again for joining us.\\nOperator: Once again, this does conclude today's conference. We do appreciate your participation.\", 'metadata': {'ticker': 'AAPL', 'quarter': 'Q3', 'date_time': '2023-08-03 21:47:09', 'speakers_list': ['Michael Ng', 'Luca Maestri', 'Saori Casey', 'Harsh Kumar', 'Sidney Ho', 'Aaron Rakers', 'Operator', 'Tim Cook', 'Amit Daryanani', 'Wamsi Mohan', 'Erik Woodring', 'Shannon Cross', 'David Vogt', 'Krish Sankar']}}\n" + ] + } + ], + "source": [ + "print(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Operator: Good day, and welcome to the Apple Q3 Fiscal Year 2023 Earnings Conference Call. Today's call is being recorded. At this time, for opening remarks and introductions, I would like to turn the call over to Saori Casey, Vice President of Finance. Please go ahead.\n", + "Saori Casey: Thank you. Good afternoon, and thank you for joining us. Speaking first today is Apple's CEO, Tim Cook; and he'll be followed by CFO, Luca Maestri. After that, we'll open the call to questions from analysts. Please note that some of the information you'll hear during our discussion today will consist of forward-looking statements, including, without limitation, those regarding revenue, gross margin, operating expenses, other income and expense, taxes, capital allocation and future business outlook, including the potential impact of macroeconomic conditions on the company's business and the results of operations. These statements involve risks and uncertainties that may cause actual results or trends to differ materially from our forecast. 
For more information, please refer to the risk factors discussed in Apple's most recently filed annual report on Form 10-K and the Form 8-K filed with the SEC today, along with the associated press release. Apple assumes no obligation to update any forward-looking statements, which speak only as of the date they are made. I'd now like to turn the call over to Tim for introductory remarks.\n", + "Tim Cook: Thank you, Saori. Good afternoon, everyone, and thanks for joining us. Today, Apple is reporting revenue of $81.8 billion for the June quarter, better than our expectations. We continued to see strong results in emerging markets, driven by robust sales of iPhone with June quarter total revenue records in India, Indonesia, Mexico, the Philippines, Poland, Saudi Arabia, Turkey and the UAE. We set June quarter records in a number of other countries as well, including France, the Netherlands and Austria. And we set an all-time revenue record in Services driven by more than $1 billion paid subscriptions. We continued to face an uneven macroeconomic environment, including nearly 4 percentage points of foreign exchange headwinds. On a constant currency basis, we grew compared to the prior year's quarter in aggregate and in the majority of markets we track. We continue to manage deliberately and innovate relentlessly, and we are driven by the sense of possibility those efforts inspire. To that end, before I turn to the quarter in more detail, I want to take a moment to acknowledge the unprecedented innovations we were proud to announce at our Worldwide Developers Conference. In addition to extraordinary new Macs and incredible updates to our software platforms, we had the chance to introduce the world to spatial computing. We were so pleased to share the revolutionary Apple Vision Pro with the world, a bold new product unlike anything else created before. Apple Vision Pro is a marvel of engineering, built on decades of innovation only possible at Apple. 
It is the most advanced personal electronic device ever created, and we've been thrilled by the reaction from press, analysts, developers and content creators who've had the chance to try it. We can't wait to get it into customers' hands early next year. Now let me share more with you on our June quarter results beginning with iPhone. iPhone revenue came in at $39.7 billion for the quarter, down 2% from the year ago quarter's record performance. On a constant currency basis, iPhone revenue grew, and we had a June quarter record for switchers, reflecting the popularity of the iPhone lineup. iPhone 14 customers continue to praise the exceptional battery life and essential health and safety features, while iPhone 14 Plus users are loving the new larger screen size. And with Dynamic Island, Always-On display and the most powerful camera system ever in an iPhone, the iPhone 14 Pro lineup is our best ever. Turning to Mac. We recorded $6.8 billion in revenue, down 7% year-over-year. We are proud to have completed the transition of our entire Mac lineup to run exclusively on Apple silicon. We are also excited to have introduced the new 15-inch MacBook Air during the quarter, the world's best 15-inch laptop and one of the best Macs we've ever made. And we launched 2 new powerhouses in computing, Mac Studio with M2 Max and M2 Ultra and Mac Pro with M2 Ultra, which are the most powerful Macs we've ever made. iPad revenue was $5.8 billion for the June quarter, down 20% year-over-year, in part due to a difficult compare because of the timing of the iPad Air launch last year. Customers are loving iPad's versatility and exceptional value. There was a great deal of excitement from creatives when we brought Final Cut Pro and Logic Pro to iPad this spring. And with the back-to-school season in full swing, iPad has the power to help students tackle the toughest assignments. 
Across Wearables, Home and Accessories, revenue was $8.3 billion, up 2% year-over-year and in line with our expectations. Packed with features to empower users to live a healthier life, Apple Watch and Apple Watch Ultra continue to help people take the next step on their wellness journey. As I mentioned earlier, last quarter, we held our biggest and most exciting WWDC yet. We were thrilled to welcome developers from across the globe to Apple Park, both in person and virtually, and to share some stunning new announcements with the world. In addition to Apple Vision Pro and the new Macs that we introduced, we had the chance to reveal some truly remarkable new innovations to our software platforms. From exciting new features like Live Voicemail and StandBy in iOS 17, to new tools for users to work, play and personalize their experience in macOS Sonoma and iPadOS 17, to a fresh design and new workout capabilities in watchOS 10, there's so much coming later this year to empower users to get more out of their devices, and we think they're going to instantly love these new features. It was also an exciting quarter for Services where revenue reached $21.2 billion and saw a sequential acceleration to an 8% year-over-year increase, better than we expected. We set an all-time revenue record for total services and in a number of categories, including video, AppleCare, cloud and payment services. Since we introduced Apple Pay almost a decade ago, customers have been loving how easy it is to make purchases online, in apps and in stores. We're also pleased to see Apple Card build on the success of Apple Pay. Designed with our users' financial health in mind, Apple Card has become one of the most successful credit card programs in the U.S. with award-winning customer satisfaction. And this spring, we introduced a new high-yield savings account for Apple Card customers, which has become incredibly popular, with customers already making more than $10 billion in deposits. 
Meanwhile, Apple TV+ continues to provide a spectacular showcase of imaginative storytelling. Recently, fans welcomed new series like Hijack and Silo as well as returning fan favorites like Foundation and The Afterparty. In the few years since its launch, Apple TV+ has earned more than 1,500 nominations and 370 wins. That includes the 54 Emmy Award nominations across 13 titles that Apple TV+ received last month. It's also been an exciting time for sports on Apple TV+. Soccer legend Lionel Messi made his debut with Major League Soccer last month, and fans all over the world tuned in with MLS Season Pass. We are excited about our MLS partnership, and we're thrilled to see Messi suiting up with Inter Miami. And just in time for summer concert season, Apple Music launched new discovery features celebrating live music, including venue guides in Apple Maps and set lists from tours of major artists. These new features and others join a lineup of updates coming later this year to make Services more powerful, more useful and more fun than ever. Everything we do is in service of our customers, and retail is where we bring the best of Apple. During the quarter, we opened the Apple Store online in Vietnam, and we're excited to connect with more customers there. We also redesigned our first-ever Apple Store located in Tysons Corner, Northern Virginia, with inclusive, innovative and sustainable design enhancements. We opened a beautiful new store beneath our new London headquarters in the historic Battersea Power Station. And the performance of the stores we opened in India this spring exceeded our initial expectations. With every product we create, every feature we develop and every interaction we share with our customers, we lead with the values we stand for. We believe in creating technology that serves all of humanity, which is why accessibility has always been a core value that we embed in everything we do. 
On Global Accessibility Awareness Day, we unveiled some extraordinary new tools for cognitive, vision, hearing and mobile accessibility that will be available later this year, including Assistive Access, which distills apps to their most essential features, and Personal Voice, which allows users to create a synthesized voice that sounds just like them. Building technology and service of our customers also means protecting their privacy, which we believe is a fundamental human right. That's why we were pleased to announce major updates to Safari Private Browsing, Communication Safety and Lockdown Mode to further safeguard our users. And as part of our efforts to build a better world, we announced that we've more than doubled our initial commitment to our Racial Equity and Justice Initiative to more than $200 million. We will continue to do our part to support education, economic empowerment and criminal justice reform work. And while supporting efforts to advance equity and opportunity, we continue to build a culture of belonging at Apple and a workforce that reflects the communities we serve. Through our environmental work, we're making strides in our commitment to leave the world better than we found it. Last month, Apple joined with global nonprofit Acumen in a new effort to improve livelihoods in India through clean energy innovation, and we are as committed as ever to our Apple 2030 goal to be carbon neutral across our entire supply chain and the life cycle of our products. We've long held that education is the great equalizer. With that in mind, we're expanding Apple Learning Coach, a free professional learning program that teaches educators how to get more out of Apple technology in the classroom. Today, we welcome more than 1,900 educators across the U.S. to the program. By the end of the year, we'll offer Apple Learning Coach in 12 more countries. 
As we're connecting with teachers, we're also celebrating the graduations of students at our app developer academies around the world. From Detroit, to Naples, to Riyadh and more, we're excited to watch these talented developers embark on careers in coding and find ways to make a positive difference in their communities. Apple remains a champion of innovation, a company fueled by boundless creativity, driven by a deep sense of mission and guided by the unshakable belief that a great idea can change the world. Looking ahead, we'll continue to manage for the long term, always pushing the limits of what's possible and always putting the customer at the center of everything we do. With that, I'll turn it over to Luca.\n", + "Luca Maestri: Thank you, Tim, and good afternoon, everyone. Revenue for the June quarter was $81.8 billion, down 1% from last year and better than our expectations despite nearly 4 percentage points of negative impact from foreign exchange. On a constant currency basis, our revenue grew year-over-year in total and in the majority of the markets we track. We set June quarter records in both Europe and Greater China and continue to see strong performance across our emerging markets driven by iPhone. Products revenue was $60.6 billion, down 4% from last year, as we faced FX headwinds and an uneven macroeconomic environment. However, our installed base reached an all-time high across all geographic segments, driven by a June quarter record for iPhone switchers and high new-to rates in Mac, iPad and Watch, coupled with very high levels of customer satisfaction and loyalty. Our Services revenue set an all-time record of $21.2 billion, up 8% year-over-year and grew double digits in constant currency. Our performance was strong around the world as we reach all-time Services revenue records in Americas and Europe and June quarter records in Greater China and rest of Asia Pacific. 
Company gross margin was 44.5%, a record level for the June quarter and up 20 basis points sequentially, driven by cost savings and favorable mix shift towards Services, partially offset by a seasonal loss of leverage. Products gross margin was 35.4%, down 130 basis points from last quarter due to seasonal loss of leverage and mix, partially offset by favorable costs. Services gross margin was 70.5%, decreasing 50 basis points sequentially. Operating expenses of $13.4 billion were below the low end of the guidance range we provided at the beginning of the quarter and decelerated from the March quarter. We continue to take a deliberate approach in managing our spend with strong focus on innovation and new product development. The results of these actions delivered net income of $19.9 billion, diluted earnings per share of $1.26, up 5% versus last year, and very strong operating cash flow of $26.4 billion. Let me now provide more detail for each of our revenue categories. iPhone revenue was $39.7 billion, down 2% year-over-year but grew on a constant currency basis. We set revenue records in several markets around the world, including an all-time record in India and June quarter records in Latin America, the Middle East and Africa, Indonesia, the Philippines, Italy, the Netherlands and the U.K. Our iPhone active installed base grew to a new all-time high, thanks to a June quarter record in switchers. This is a testament to our extremely high levels of customer satisfaction, which 451 Research recently measured at 98% for the iPhone 14 family in the U.S. Mac generated $6.8 billion in revenue, down 7% year-over-year. We continue to invest in our Mac portfolio. And this past quarter, we were pleased to complete the transition to Apple silicon for the entire lineup. This transition has driven both strong upgrade activity and a high number of new customers. In fact, almost half of Mac buyers during the quarter were new to the product. 
We also saw reported customer satisfaction of 96% for Mac in the U.S. iPad revenue was $5.8 billion, down 20% year-over-year and in line with our expectations. These results were driven by a difficult compare against the full quarter impact of the iPad Air launch in the prior year. At the same time, we continue to attract a large number of new customers to the iPad installed base with over half of the customers who purchased iPads during the quarter being new to the product. And the latest reports from 451 Research indicate customer satisfaction of 96% in the U.S. Wearables, Home and Accessories revenue was $8.3 billion, up 2% year-over-year, with a June quarter record in Greater China and strong performance in several emerging markets. We continue to see Apple Watch expand its reach with about 2/3 of customers purchasing an Apple Watch during the quarter being new to the product. And this is combined with very high levels of customer satisfaction, which was recently reported at 98% in the United States. Moving on to Services. We reached a new all-time revenue record of $21.2 billion with year-over-year growth accelerating sequentially to 8% and up double digits in constant currency. In addition to the all-time records Tim mentioned earlier, we also set June quarter records for advertising, App Store and Music. We are very pleased with our performance in Services, which is a direct reflection of our ecosystem's strength. First, our installed base of over 2 billion active devices continues to grow at a nice pace and establishes a solid foundation for the future expansion of our ecosystem. Second, we see increased customer engagement with our services. Both our transacting accounts and paid accounts grew double digits year-over-year, each reaching a new all-time high. Third, our paid subscriptions showed strong growth. 
This past quarter, we reached an important milestone and passed 1 billion paid subscriptions across the services on our platform, up 150 million during the last 12 months and nearly double the number of paid subscriptions we had only 3 years ago. And finally, we continue to improve the breadth and the quality of our current services. From 20 new games on Apple Arcade, to brand-new content on Apple TV+, to the launch of our high-yield savings account with Apple Card, our customers are loving these enhanced offerings. Turning to the enterprise market. Our customers are leveraging Apple products every day to help improve productivity and attract talent. Blackstone, a global investment management firm, is expanding its Apple footprint from their corporate iPhone fleet to now offering the MacBook Air powered by M2 to all of their corporate employees and portfolio companies. Gilead, a leading biopharmaceutical company, has deployed thousands of iPads globally to their sales team. Over the last 6 months, they have also doubled their Mac user base by making MacBook Air available to more employees with a focus on user experience and strong security. Let me now turn to our cash position and capital return program. We ended the quarter with over $166 billion in cash and marketable securities. We repaid $7.5 billion in maturing debt while issuing $5.2 billion of new debt and increasing commercial paper by $2 billion, leaving us with total debt of $109 billion. As a result, net cash was $57 billion at the end of the quarter. During the quarter, we returned over $24 billion to shareholders, including $3.8 billion in dividends and equivalents and $18 billion through open market repurchases of 103 million Apple shares. We continue to believe there is great value in our stock and maintain our target of reaching a net cash neutral position over time. 
As we move ahead into the September quarter, I'd like to review our outlook, which includes the types of forward-looking information that Saori referred to at the beginning of the call. We expect our September quarter year-over-year revenue performance to be similar to the June quarter, assuming that the macroeconomic outlook doesn't worsen from what we are projecting today for the current quarter. Foreign exchange will continue to be a headwind, and we expect a negative year-over-year revenue impact of over 2 percentage points. We expect iPhone and Services year-over-year performance to accelerate from the June quarter. Also, we expect the revenue for both Mac and iPad to decline by double digits year-over-year due to difficult compares, particularly on the Mac. For both products, we experienced supply disruptions from factory shutdowns in the June quarter a year ago and were able to fulfill significant pent-up demand in the year ago September quarter. We expect gross margin to be between 44% and 45%. We expect OpEx to be between $13.5 billion and $13.7 billion. We expect OI&E to be around negative $250 million, excluding any potential impact from the mark-to-market of minority investments, and our tax rate to be around 16%. Finally, today, our Board of Directors has declared a cash dividend of $0.24 per share of common stock payable on August 17, 2023, to shareholders of record as of August 14, 2023. With that, let's open the call to questions.\n", + "Saori Casey: Thank you, Luca. [Operator Instructions]. Operator, may we have the first question, please?\n", + "Operator: [Operator Instructions]. We will go ahead and take our first question from Shannon Cross with Credit Suisse.\n", + "Shannon Cross: Tim, you mentioned -- and actually, Luca, too, you mentioned an uneven macro environment during the quarter several times on the call. I'm wondering if you can talk on a geographic basis about some of the trends you're seeing in iPhone. 
I'm specifically wondering how demand is trending within...\n", + "Luca Maestri: Sure. Shannon, I'll answer it. I didn't get the end of your question.\n", + "Operator: I think she has dropped.\n", + "Luca Maestri: Okay. Well, let me answer the question for the part that I could follow. So on a geographic basis, we've had great performance for iPhone in emerging markets. We set June quarter records in many of the emerging markets. We grew in total double digits. And the performance was strong across the board in emerging markets from China, where our performance improved from minus 3% to plus 8% in the June quarter and we grew double digits in constant currency, to many other areas around the world from India, where, again, we set a June quarter record with very strong performance there, Indonesia, Southeast Asia, in general, Latin America, Middle East. And so it's been really good there. We -- also, as you can see from our geographic segments, we had a slight acceleration of performance in the Americas, primarily in the United States, but we declined there because the smartphone market has been in a decline for the last couple of quarters in the United States.\n", + "Shannon Cross: Sorry about that. I'm not sure why I cut off. In terms of gross margin, you were at the high end of the range [Technical Difficulty] and you guided to 45% at the high end, which is, I think, higher than I remember in 20 years of covering you. So how should we think about puts and takes of gross margin? And it seems like there's like a perfect storm of good things. So I just -- maybe if you can talk about how you're thinking about it more holistically.\n", + "Luca Maestri: I think you remember correctly, Shannon, because the 44.5% for the June quarter is an all-time record for us in June. We were up 20 basis points sequentially. It was driven by cost savings and a mix shift towards Services, which obviously helps company gross margins, partially offset by the seasonal loss of leverage. 
We have a commodity environment that is favorable to us. Our product mix is quite strong at this point. And so with the exception of foreign exchange, which continues to be a drag, and it was a significant drag on a year-over-year basis, yes, we are in a good position right now. We are in a good position for the June quarter. And as I mentioned, we expect similar level of gross margins for the same reasons, frankly, for the September quarter.\n", + "Operator: Our next question comes from Wamsi Mohan of Bank of America.\n", + "Wamsi Mohan: Luca, can you just give us a little more color around the guidance? Your overall revenue performance, you called out similar. Obviously, you absorbed a higher FX impact this quarter versus your guide. And you also noted Services acceleration. So just wondering, when you think about that comment on iPhone acceleration, is that on a reported basis? Is that constant currency basis? And is there something that's changing in terms of seasonality perhaps for you that is causing not as much step-up in product revenue as typical on a sequential basis? And I have a follow-up.\n", + "Luca Maestri: Yes. So all our comments are in reported currency, not in constant currency in relation to the outlook. And we said acceleration sequentially for iPhone and for Services. But we're also pointing out -- and this is where I think, Wamsi, you're referring to some seasonality issues. We also said that for Mac and iPad, we expect to decline double digits. And the reason for that is that we have a very difficult compare versus last year. You remember that a year ago, in the June quarter, we had factory shutdowns for both Mac and iPad. And so we were able to fill the pent-up demand from those shutdowns during the September quarter. So an unusual level of activity that we had a year ago. And so now, obviously, the compare is difficult. 
So we expect both iPad and Mac to be down double digits, which offset the acceleration that I mentioned for iPhone and Services.\n", + "Wamsi Mohan: Okay. And Tim, I was wondering if you could update us on what percent of iPhones are sold on some type of installment basis now versus full upfront payment on a global basis. And maybe some thoughts on if you expect similar promotional activity from carriers, especially in the U.S., that seem to be grappling with a lot of cash flow issues this particular year.\n", + "Luca Maestri: Wamsi, I'll take it. We've done a really good job over the last few years with affordability programs around the world directly in our direct channel and with our partners around the world. The majority of iPhones, at this point, are sold using some kind of a program, trade-ins, installments, some kind of financing. And that percentage, which again, it's well over 50%, is very similar across developed and emerging markets. We want to do more of that because we think it really helps reduce the affordability threshold for our products. And we think it is also one of the reasons why our product mix has been very strong during the last couple of cycles. So we will continue to push on that front.\n", + "Operator: Our next question is from David Vogt with UBS.\n", + "David Vogt: I just wanted to follow up on 2 points that both you, Tim, and Luca made about growth and maybe commodities. So just to be clear, I know you're talking about an acceleration in iPhone, but the comp is about 2 points easier from FX. So I just want to understand, is that on a like-for-like basis, excluding the currency improvement of about 2 points from the June quarter to the September quarter? And from a commodity perspective, I know last quarter, you talked about buying a lot of inventory at favorable prices, which was an incredibly smart strategy. Where do you sit today? 
And what's sort of the timing or the duration of that commodity sort of backlog that you have as we think about next quarter and the subsequent quarters? How far does that get you out into the future from this favorable cost dynamic?\n", + "Luca Maestri: Let me start again. I just want to be clear about the guidance, the outlook guidance that we provided. We're referring entirely to reported numbers. So they take into account the fact that we have a slight improvement in foreign exchange. So when I talk about similar performance, I refer to reported performance in the June quarter and then the reported performance in the September quarter. And again, we expect, on a reported basis, our iPhone performance to accelerate, our Services performance to accelerate, and iPad and Mac to decline double digits. On the commodity front, as I mentioned, the environment is favorable. We always make sure that we take advantage of the opportunities that are available in the market, and we will continue to do that going forward.\n", + "David Vogt: Luca, any sense of how long that gives you a run rate today based on what you currently have? Can you give us a sense for at least the short-term tailwind?\n", + "Luca Maestri: I don't want to speculate past the September quarter because that's the horizon where we provide guidance. And I've said that the guidance for September is 44% to 45%, which you know is historically very high. And so obviously, that reflects a favorable environment for us.\n", + "Operator: Our next question is from Erik Woodring with Morgan Stanley.\n", + "Erik Woodring: I have 2 as well. Maybe if we just start kind of big picture, Tim or Luca. I was wondering if you could just kind of share some incremental color on how you think the consumer is behaving today versus 90 days ago and maybe how that differs by region. Meaning, are there any signs that consumer is incrementally more willing to spend on things like consumer electronics? 
Or is there still relative caution in the market? Are there any regions where you're seeing more strength in the consumer? And how sustainable do you think some of that strength or weakness could be based on some of the KPIs you track? And then I have a follow-up.\n", + "Tim Cook: Yes. David, it's Tim. If you sort of step around the world, we did exceptionally well in emerging markets last quarter and even better on a constant currency basis. And so emerging markets were -- was a strength. If you look at China, in China, we went from a negative 3% in Q2 to a plus 8% in Q3. And so in China, we had an acceleration. If you look at the U.S., which is in the -- obviously in the Americas segment, it is the vast majority of what's in there, there was also a slight acceleration sequentially, although the Americas is still declining somewhat year-over-year, as you can see on the data sheet. The primary reason for that is that it's a challenging smartphone market in the U.S. currently. And then in Europe, Europe saw a record quarter and -- for the June quarter, a record. And so some really good signs in most places in the world.\n", + "Erik Woodring: Awesome. And then maybe, Luca, a question for you. I think it's been about 3 quarters now where we've seen OpEx either grow below historical seasonality or come in below your expectations. I think this is the first time we've seen R&D grow less than 10% year-over-year since fiscal 2Q 2007. So can you maybe just talk about some of the cost actions you're taking? And as you look forward, what are the indicators that you're really evaluating that would give you greater confidence in perhaps returning back to a more seasonal cadence of OpEx spending? Or is this just a new normal that we should be expecting? That's it for me.\n", + "Luca Maestri: Obviously, we look at the environment, and we know that this has been an uncertain period for the last few quarters. 
And so we decided to be deliberate in what we do in terms of controlling our spend, and there's many areas across the company that we're working on and we've been quite effective at slowing down the spend. We slowed down also the hiring within the company in several areas. And we're very pleased with our ability to decelerate some of the expense growth taking into account the overall macro situation. We will continue to manage deliberately. You can see that we continue to grow our R&D costs faster than the rest of the company. SG&A is actually growing at a much slower pace because obviously, our focus continues to be in innovation and product development, and we'll continue to do that.\n", + "Operator: Our next question is from Michael Ng with Goldman Sachs.\n", + "Michael Ng: I just have 2 questions as well. First, it was encouraging to see the Services outperformance in the quarter, up double digits on an FX-neutral basis, and more Services acceleration next quarter on a reported basis. I was just wondering if you could just talk a little bit more about key underlying drivers for the confidence in the Services acceleration next quarter, understanding that FX a little bit. But anything to call out as it relates to things in Apple Search Ads that's helping. You're obviously making a lot of investments in Apple TV+ between MLS and the Canal+ deal. So any thoughts there would be great.\n", + "Luca Maestri: Yes, Michael, you're correct. I mean clearly, we've seen an improvement in the June quarter, and we expect further improvement in the September quarter. In June, the performance was across the board. Tim and I mentioned we set records really across the board. We had all-time records in cloud, in video, in AppleCare, in payments and June quarter records in App Store, advertising and Music. So we saw improvement in all our Services categories. We think the situation will continue to improve as we go through September. 
And that's very positive because not only good for the financial results, but obviously, it shows a high level of engagement of our customers in the ecosystem, which is very important for us. And it's really the sum of all the things that I mentioned in my prepared remarks. It goes from the fact that our installed base continues to grow, so we've got a larger pool of customers, to the fact that our customers are more engaged as we have more transacting accounts and paid accounts on the ecosystem. And the subscriptions business is very healthy with growth of 150 million paid subscriptions just in the last 12 months. It's almost double to what we had 3 years ago. And of course, we are providing more and more content to our users. And so the combination of all these things gives us good confidence for September.\n", + "Michael Ng: Great. And just as a related follow-up, it's about the hardware installed base and Services ARPU. I was curious when you talked about the Services strength, you talked about the 2 billion-plus installed base. When you think about the opportunity to increase the Services ARPU, do you really think about it internally on a per-active-iPhone user basis or on a per-device basis? Said differently, I'm just curious where you think about -- whether you think there's an incremental opportunity for those users that have multiple devices. Do you really see a big Services ARPU uplift in that respect?\n", + "Luca Maestri: Well, we know that customers that own more than one device are typically more engaged in our ecosystem. And so obviously, they tend to also spend more on the Services front. I would say the biggest opportunity is that we know that there's a lot of customers that we have that are very familiar with our ecosystem. They are engaged in the ecosystem. But still today, they're using only the portion of the ecosystem that is free. 
And so we think that by offering better content and more content over time, we're going to be able to attract more of them as paid customers.\n", + "Operator: Our next question is from Amit Daryanani with Evercore.\n", + "Amit Daryanani: I have 2 as well. I guess, Luca, maybe if you can talk about Wearables a bit. The growth over there, I think, in constant currency was fairly impressive at plus 6%. Can you just touch on maybe what's driving that? And then how do we think about the Wearables segment heading into the September quarter? I know you talked about a bunch of other ones, but how do we think about Wearables into September as well?\n", + "Luca Maestri: Sorry, Amit, I didn't get the -- what are you referring to?\n", + "Amit Daryanani: Yes. Sorry. I was hoping you could talk a bit about the Wearables segment because the growth over there was fairly impressive. And then how do you think about it into September as well?\n", + "Luca Maestri: Yes. On the Wearables front, we had really good performance in Greater China. And that's, again, very important for us. It was a June quarter record for Greater China. Very important for us because, again, it shows that the engagement with the ecosystem in a market that is so important for us like China continues to grow. It means that there's more and more customers that are owning more than the iPhone. Also, we continue to grow the installed base of the category very quickly because, as I mentioned, 2/3 of every buyer of Apple Watch during the course of the June quarter was new to the product. And so that is all additive to the installed base. So it's just great to see that the AirPods continue to be a great success in the marketplace for us. And so things are moving in the right direction there. It's become a very large business for us in Wearables, Home and Accessories. The last 12 months, we've done $40 billion of business, which is nearly the size of a Fortune 100 company. 
So it's become very important, and it's allowed us to diversify both our revenues and our earnings.\n", + "Amit Daryanani: That's really helpful. And then if I could just follow up, the Europe growth, the growth in Europe at up 5% is totally notable as well. I think you have a few emerging markets that you put in Europe as well. But I would love to understand what's happening in Europe and if there's a way to think about sort of Western Europe or developed world versus emerging markets over there.\n", + "Luca Maestri: Yes. It's been very good, primarily on the emerging market side of Europe. We include India and the Middle East and Central and Eastern Europe into the Europe segment. But as we mentioned at the beginning of the call, we had a number of markets that did very well, like France, like Italy, the Netherlands, Austria. So it was a good quarter for Europe.\n", + "Operator: Our next question is from Harsh Kumar with Piper Sandler.\n", + "Harsh Kumar: I have one for Luca and then later on one for Tim. So Luca, for some time now, for many quarters, you've had a currency headwind or foreign exchange currency headwind. It's conceivable that as rates start to come down, hopefully next year that the dollar weakens. Could you take us through the mechanism of how that will work on your revenues and for your costs?\n", + "Luca Maestri: So we tend -- we try to hedge our foreign exchange exposures because we think it's the right approach for the company in terms of minimizing the volatility that necessarily happens from the movements of currencies. We cannot effectively hedge every single exposure around the world because in some cases, it is not possible. In other cases, it is prohibitively expensive. But we tend to cover all the major currency payers that we have. About 60% of our business is outside the United States. So it's a very, very large and, I would say, very effective hedging program. And so we set up these hedges, and they tend to roll over very regularly. 
And then we replace them with new hedges at the new spot rate. So the impact that we're going to have on revenue and cost will depend on where the spot rates are at different points in time. And therefore, because of the way the program works, tends to be a bit of a lag in both directions as the foreign exchange moves over time.\n", + "Harsh Kumar: Understood. Very helpful. And for Tim, Tim, historically, for the last many years, carriers in at least the U.S., which I think is your largest market for iPhone, have had programs to help folks upgrade, whether they give a cash rebate or you bring in your old phone, something like that. I was curious, as you get into your peak December quarter, if you're aware of these programs are in place. And the reason why I'm asking is I think earlier, you mentioned that more than 50% of your phones are sold through some kind of program. I assume the number is even higher in the U.S.\n", + "Tim Cook: I don't want to get into revealing specifics in the different carriers. But generally speaking, I would think that it would be quite easy to find a promotion on a phone, provided you're hooking up to a service and either switching services, carriers or upgrading your phone at the same carrier. I think both of those cases today that you can find promotions out there, and I would expect that you'd be able to do that in the December time frame as well.\n", + "Operator: Our next question is from Aaron Rakers with Wells Fargo.\n", + "Aaron Rakers: I have two as well. So first of all, I just want to kind of ask Tim. 
Strategically, as we think about the Services growth and kind of the content expansion behind that, I'm curious if you could help us maybe appreciate what you've seen from a sporting perspective in terms of the engagement with MLS, the engagement with Major League Baseball, and how strategically you're thinking about expansion in sports as a key driver of Services growth going forward.\n", + "Tim Cook: We're focused on original content, as you know, with TV+. And so we're all about giving great storytellers the venue to tell great stories and hopefully get us all to think a little deeper. And sport is a part of that because sport is the ultimate original story. And for MLS, we're -- we could not be happier with how the partnership is going. It's clearly in the early days, but we are beating our expectation in terms of subscribers, and the fact that Messi went to Inter Miami helped us out there a bit. And so we're very excited about it.\n", + "Aaron Rakers: Yes. And as a quick follow-up, I'm just curious, an update on -- you mentioned in your prepared remarks the continued growth that you've seen in India. I'm curious how we think about that market opportunity looking forward. Is there anything that you see evolving that could accelerate the opportunity for iPhone in that large mobile market?\n", + "Tim Cook: We did hit a June quarter revenue record in India, and we grew strong double digits. We also opened our first 2 retail stores during the quarter. And it's -- of course, it's early going currently, but they're currently beating our expectation in terms of how they're doing. We continue to work on building out the channel and putting more investment in our direct-to-consumer offers as well. And so I think if you look at it, it's the second largest smartphone market in the world. And it's -- so we ought to be doing really well there. 
And where I'm really pleased with our growth there, we're still -- we still have a very, very modest and low share in the smartphone market. And so I think that it's a huge opportunity for us. And we're putting the -- all of our energies in making that occur.\n", + "Operator: Our next question comes from Sidney Ho with Deutsche Bank.\n", + "Sidney Ho: Your -- I just wanted to ask about the AI side of things. Your strategy on AI seems quite different than many of your peers, at least you don't talk too much about that, how much you invest in it. Maybe you can elaborate a little bit on that. But related to that, how do you see your investment in this area turning into financial performance in the future? Is it mainly through faster upgrade cycle, maybe higher ASP? Or are you thinking about maybe additional services that you can capitalize on that? And then I have a follow-up.\n", + "Tim Cook: If you take a step back, we view AI and machine learning as core fundamental technologies that are integral to virtually every product that we build. And so if you think about WWDC in June, we announced some features that will be coming in iOS 17 this fall, like Personal Voice and Live Voicemail. Previously, we had announced lifesaving features like fall detection and crash detection and ECG. None of these features that I just mentioned and many, many more would be possible without AI and machine learning. And so it's absolutely critical to us. And of course, we've been doing research across a wide range of AI technologies, including generative AI for years. We're going to continue investing and innovating and responsibly advancing our products with these technologies with the goal of enriching people's lives. And so that's what it's all about for us. And as you know, we tend to announce things as they come to market, and that's our MO, and I'd like to stick to that.\n", + "Sidney Ho: Okay. That's fair. 
Maybe as a follow-up is related to -- you talked about WWDC, where you actually introduced Vision Pro there. Clearly, a very big announcement there. How should we think about the revenue ramp related to the Vision Pro? Is there any catalysts that we should be thinking about that will drive an inflection of that product?\n", + "Tim Cook: Yes. There's enormous excitement around the Vision Pro. We're excited internally. Everybody that's been through the demos are blown away, whether you're talking about press or analysts or developers. We are now shipping units to the developer community for them to begin working on their apps. And we're looking forward to shipping early next year. And so we could not be more excited with that. I'm using the product daily. And so we're not going to forecast revenues and so forth on the call today, but we're very excited about it.\n", + "Operator: We will take our last question from Krish Sankar with TD Cowen.\n", + "Krish Sankar: I have two of them as well. Number one, on iPhone, Tim, you mentioned about the record number of switchers in the quarter. I'm kind of curious how to think about, given the weak macro and consumer spending, how is the replacement cycle for iPhone? Is it similar, longer, shorter versus prior years? And can you talk a little bit about the demand linearity of iPhone during the June quarter? And then I have a follow-up.\n", + "Tim Cook: Switchers were a very key part of our iPhone results for the quarter. We did set a record. We set a record in Greater China, in particular, and it was at the heart of our results there. And we continue to try to convince more and more people to switch because of our -- the experience and the ecosystem and -- that we can offer them. And so I think switching is a huge opportunity for us. In terms of the upgrade cycle and so forth, it's very difficult to estimate real time what is going on with the upgrade cycle. 
I would say, if you think about the iPhone results year-over-year, you have to think about the SE announcement in the year ago quarter, the iPhone SE announcement in the year ago quarter. And so that provides a bit of a headwind on the comp. But as Luca said, as he talked about how we're viewing Q4, the September quarter, we see iPhone accelerating in Q4.\n", + "Krish Sankar: Got it. Very helpful, Tim. And then my final question is on your retail stores, you obviously have a very large retail footprint and many of your stores seem to have been open for over a year now. How is the foot traffic there? And how do you think about sales or the retail trends in the June quarter and implications for the back half of this year on a seasonality basis?\n", + "Tim Cook: I'm sorry, are you talking about our retail stores?\n", + "Krish Sankar: Yes, yes, your retail stores.\n", + "Tim Cook: Yes. The -- if you look at retail, it's a key part of our go-to-market approach, and it will be so key and such a competitive advantage with Vision Pro. It will give us the opportunity to launch a new product and demo to many people in the stores. And so it has many advantages in it. And we continue to roll out more stores. As you know, we just opened 2 in India last quarter. We're -- there's still a lot of countries out there that don't have Apple stores that we would like to go into. And so we continue to see it as a key part of how we go to market and love the experience that we can provide customers there.\n", + "Saori Casey: A replay of today's call will be available for two weeks on Apple Podcasts, at a webcast of apple.com/investor and via telephone. The number for the telephone replay is 866-583-1035. Please enter the confirmation code 2553017, followed by the pound sign. These replays will be available by approximately 5 p.m. Pacific Time today. Members of the press with additional questions can contact Josh Rosenstock at 408-862-1142. 
Financial analysts can contact me, Saori Casey, with additional questions at 408-974-3123 while Suhasini Chandramouli is on her maternity leave. Thank you again for joining us.\n", + "Operator: Once again, this does conclude today's conference. We do appreciate your participation.\n" + ] + } + ], + "source": [ + "print(docs['text'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'ticker': 'AAPL', 'quarter': 'Q3', 'date_time': '2023-08-03 21:47:09', 'speakers_list': ['Michael Ng', 'Luca Maestri', 'Saori Casey', 'Harsh Kumar', 'Sidney Ho', 'Aaron Rakers', 'Operator', 'Tim Cook', 'Amit Daryanani', 'Wamsi Mohan', 'Erik Woodring', 'Shannon Cross', 'David Vogt', 'Krish Sankar']}\n" + ] + } + ], + "source": [ + "print(docs['metadata'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_News.ipynb b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_News.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..98ce774188e76fc27f4ed7fb8dc05f93ee6e4f1b --- /dev/null +++ b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_News.ipynb @@ -0,0 +1,3949 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../FinNLP\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 
CNBC" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.cnbc_streaming import CNBC_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = CNBC_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(30, 30)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['description', 'cn:lastPubDate', 'dateModified', 'cn:dateline',\n", + " 'cn:branding', 'section', 'cn:type', 'author', 'cn:source',\n", + " 'cn:subtype', 'duration', 'summary', 'expires', 'cn:sectionSubType',\n", + " 'cn:contentClassification', 'pubdateunix', '_id', 'url', '@id',\n", + " 'datePublished', 'cn:promoImage', 'cn:title', 'cn:keyword',\n", + " 'cn:liveURL', '_pubDate', '_type', '_index', 'brand', 'hint',\n", + " 'hint_detail'],\n", + " dtype='object')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
descriptioncn:lastPubDatedateModifiedcn:datelinecn:brandingsectioncn:typeauthorcn:sourcecn:subtype...cn:promoImagecn:titlecn:keywordcn:liveURL_pubDate_type_indexbrandhinthint_detail
0While Leah Ellis was earning her doctorate at ...2023-06-24T10:00:01+00002023-06-24T10:00:01+0000cnbcClean TechcnbcnewsstoryCatherine Clifford[]...https://image.cnbcfm.com/api/v1/image/10726095...Meet the 33-year-old Canadian chemist and the ...https://www.cnbc.com/2023/06/24/sublime-system...6/24/2023 10:00:01 PM00cnbcNaNNaN
1Amazon.com said on Friday it will take its inv...2023-06-24T04:50:41+00002023-06-24T04:50:41+0000cnbcTechnologywirestory[]...https://image.cnbcfm.com/api/v1/image/10726178...Amazon raises investment in India to $26 billi...https://www.cnbc.com/2023/06/24/amazon-commits...6/24/2023 1:49:10 PM01cnbcNaNNaN
\n", + "

2 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " description \\\n", + "0 While Leah Ellis was earning her doctorate at ... \n", + "1 Amazon.com said on Friday it will take its inv... \n", + "\n", + " cn:lastPubDate dateModified cn:dateline cn:branding \\\n", + "0 2023-06-24T10:00:01+0000 2023-06-24T10:00:01+0000 cnbc \n", + "1 2023-06-24T04:50:41+0000 2023-06-24T04:50:41+0000 cnbc \n", + "\n", + " section cn:type author cn:source cn:subtype ... \\\n", + "0 Clean Tech cnbcnewsstory Catherine Clifford [] ... \n", + "1 Technology wirestory [] ... \n", + "\n", + " cn:promoImage \\\n", + "0 https://image.cnbcfm.com/api/v1/image/10726095... \n", + "1 https://image.cnbcfm.com/api/v1/image/10726178... \n", + "\n", + " cn:title cn:keyword \\\n", + "0 Meet the 33-year-old Canadian chemist and the ... \n", + "1 Amazon raises investment in India to $26 billi... \n", + "\n", + " cn:liveURL _pubDate \\\n", + "0 https://www.cnbc.com/2023/06/24/sublime-system... 6/24/2023 10:00:01 PM \n", + "1 https://www.cnbc.com/2023/06/24/amazon-commits... 6/24/2023 1:49:10 PM \n", + "\n", + " _type _index brand hint hint_detail \n", + "0 0 0 cnbc NaN NaN \n", + "1 0 1 cnbc NaN NaN \n", + "\n", + "[2 rows x 30 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datePublishedcn:lastPubDatedateModifieddescriptionsectionauthorsummarycn:titlecn:type
02023-06-24T14:00:01+00002023-06-24T10:00:01+00002023-06-24T10:00:01+0000While Leah Ellis was earning her doctorate at ...Clean TechCatherine CliffordSublime Systems is scaling up a green cement. ...Meet the 33-year-old Canadian chemist and the ...cnbcnewsstory
12023-06-24T05:49:10+00002023-06-24T04:50:41+00002023-06-24T04:50:41+0000Amazon.com said on Friday it will take its inv...TechnologyModi and Jassy spoke about supporting Indian s...Amazon raises investment in India to $26 billi...wirestory
22023-06-23T22:12:07+00002023-06-23T18:29:45+00002023-06-23T18:29:45+0000As Microsoft attempts to convince regulators t...TechnologyJordan NovetMicrosoft has been eager to grow in mobile gam...Microsoft says it looked at acquiring Zynga bu...cnbcnewsstory
32023-06-23T21:51:15+00002023-06-23T17:51:15+00002023-06-23T17:51:15+0000The CEOs of Apple, Alphabet, Microsoft got a h...TechnologySeema ModyTop tech execs met with Indian Prime Minister ...Apple's Tim Cook calls India 'huge opportunity...cnbcnewsstory
42023-06-23T17:32:48+00002023-06-23T13:36:59+00002023-06-23T13:36:59+0000Tech executives like Apple CEO Tim Cook visit ...Fast Money Halftime ReportSeema ModyTech executives like Apple CEO Tim Cook visit ...Tech CEOs meet President Biden and Indian PM M...cnbcvideo
52023-06-23T12:30:17+00002023-06-23T13:32:42+00002023-06-23T13:32:42+0000Anyone want to buy or sell this tech rally? To...Pro: Pro ColumnistsBob PisaniFollowing the rebalancing of S&P indexes last ...Friday could offer a once-in-a-year chance to ...cnbcnewsstory
62023-06-23T12:26:42+00002023-06-23T11:47:06+00002023-06-23T11:47:06+0000Here are Friday's biggest calls on Wall Street...Pro: Analyst Stock PicksMichael BloomHere are Friday's biggest calls on Wall Street.Here are Friday's biggest analyst calls: Meta,...cnbcnewsstory
72023-06-23T06:30:01+00002023-06-23T02:30:01+00002023-06-23T02:30:01+0000This report is from today's CNBC Daily Open, o...Daily OpenYeo Boon PingInvestors have been lulled by a sense of secur...CNBC Daily Open: Seeking shelter in techcnbcnewsstory
82023-06-23T05:45:33+00002023-06-23T10:37:42+00002023-06-23T10:37:42+0000AMSTERDAM — Artificial intelligence has a raci...TechnologyRyan BrowneWhen it comes to banking and financial service...A.I. has a discrimination problem. In banking,...cnbcnewsstory
92023-06-22T23:43:01+00002023-06-23T01:01:10+00002023-06-23T01:01:10+0000This report is from today's CNBC Daily Open, o...Daily OpenYeo Boon PingInvestors have been lulled by a sense of secur...CNBC Daily Open: Rate hikes and red lightscnbcnewsstory
\n", + "
" + ], + "text/plain": [ + " datePublished cn:lastPubDate \\\n", + "0 2023-06-24T14:00:01+0000 2023-06-24T10:00:01+0000 \n", + "1 2023-06-24T05:49:10+0000 2023-06-24T04:50:41+0000 \n", + "2 2023-06-23T22:12:07+0000 2023-06-23T18:29:45+0000 \n", + "3 2023-06-23T21:51:15+0000 2023-06-23T17:51:15+0000 \n", + "4 2023-06-23T17:32:48+0000 2023-06-23T13:36:59+0000 \n", + "5 2023-06-23T12:30:17+0000 2023-06-23T13:32:42+0000 \n", + "6 2023-06-23T12:26:42+0000 2023-06-23T11:47:06+0000 \n", + "7 2023-06-23T06:30:01+0000 2023-06-23T02:30:01+0000 \n", + "8 2023-06-23T05:45:33+0000 2023-06-23T10:37:42+0000 \n", + "9 2023-06-22T23:43:01+0000 2023-06-23T01:01:10+0000 \n", + "\n", + " dateModified \\\n", + "0 2023-06-24T10:00:01+0000 \n", + "1 2023-06-24T04:50:41+0000 \n", + "2 2023-06-23T18:29:45+0000 \n", + "3 2023-06-23T17:51:15+0000 \n", + "4 2023-06-23T13:36:59+0000 \n", + "5 2023-06-23T13:32:42+0000 \n", + "6 2023-06-23T11:47:06+0000 \n", + "7 2023-06-23T02:30:01+0000 \n", + "8 2023-06-23T10:37:42+0000 \n", + "9 2023-06-23T01:01:10+0000 \n", + "\n", + " description \\\n", + "0 While Leah Ellis was earning her doctorate at ... \n", + "1 Amazon.com said on Friday it will take its inv... \n", + "2 As Microsoft attempts to convince regulators t... \n", + "3 The CEOs of Apple, Alphabet, Microsoft got a h... \n", + "4 Tech executives like Apple CEO Tim Cook visit ... \n", + "5 Anyone want to buy or sell this tech rally? To... \n", + "6 Here are Friday's biggest calls on Wall Street... \n", + "7 This report is from today's CNBC Daily Open, o... \n", + "8 AMSTERDAM — Artificial intelligence has a raci... \n", + "9 This report is from today's CNBC Daily Open, o... 
\n", + "\n", + " section author \\\n", + "0 Clean Tech Catherine Clifford \n", + "1 Technology \n", + "2 Technology Jordan Novet \n", + "3 Technology Seema Mody \n", + "4 Fast Money Halftime Report Seema Mody \n", + "5 Pro: Pro Columnists Bob Pisani \n", + "6 Pro: Analyst Stock Picks Michael Bloom \n", + "7 Daily Open Yeo Boon Ping \n", + "8 Technology Ryan Browne \n", + "9 Daily Open Yeo Boon Ping \n", + "\n", + " summary \\\n", + "0 Sublime Systems is scaling up a green cement. ... \n", + "1 Modi and Jassy spoke about supporting Indian s... \n", + "2 Microsoft has been eager to grow in mobile gam... \n", + "3 Top tech execs met with Indian Prime Minister ... \n", + "4 Tech executives like Apple CEO Tim Cook visit ... \n", + "5 Following the rebalancing of S&P indexes last ... \n", + "6 Here are Friday's biggest calls on Wall Street. \n", + "7 Investors have been lulled by a sense of secur... \n", + "8 When it comes to banking and financial service... \n", + "9 Investors have been lulled by a sense of secur... \n", + "\n", + " cn:title cn:type \n", + "0 Meet the 33-year-old Canadian chemist and the ... cnbcnewsstory \n", + "1 Amazon raises investment in India to $26 billi... wirestory \n", + "2 Microsoft says it looked at acquiring Zynga bu... cnbcnewsstory \n", + "3 Apple's Tim Cook calls India 'huge opportunity... cnbcnewsstory \n", + "4 Tech CEOs meet President Biden and Indian PM M... cnbcvideo \n", + "5 Friday could offer a once-in-a-year chance to ... cnbcnewsstory \n", + "6 Here are Friday's biggest analyst calls: Meta,... cnbcnewsstory \n", + "7 CNBC Daily Open: Seeking shelter in tech cnbcnewsstory \n", + "8 A.I. has a discrimination problem. In banking,... 
cnbcnewsstory \n", + "9 CNBC Daily Open: Rate hikes and red lights cnbcnewsstory " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"datePublished\", \"cn:lastPubDate\", \"dateModified\", \"description\", \"section\" ,\"author\", \"summary\" , \"cn:title\", \"cn:type\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Yicai / 第一财经" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.yicai_streaming import Yicai_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = Yicai_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"茅台\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60, 13)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorchannelidcreationDatedescidpreviewImagesourcetagstitletopicstypeourlweight
010000032006-21 11:41101788593第一财经北斗星通;游资;龙虎;买入;通信机构抄底超讯通信 游资封板北斗星通丨龙虎榜10/news/101788593.html50
1[周艾琳]5306-20 21:552003年7月,第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票,受到了国内...1017881832023/06/e42c4bda8cc367f523764c90447ab5a3.jpg第一财经外资;A股;基金;QFII;RQFII;瑞银QFII投资A股走过20年,外资驶向何方?10/news/101788183.html50
\n", + "
" + ], + "text/plain": [ + " author channelid creationDate \\\n", + "0 100000320 06-21 11:41 \n", + "1 [周艾琳] 53 06-20 21:55 \n", + "\n", + " desc id \\\n", + "0 101788593 \n", + "1 2003年7月,第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票,受到了国内... 101788183 \n", + "\n", + " previewImage source \\\n", + "0 第一财经 \n", + "1 2023/06/e42c4bda8cc367f523764c90447ab5a3.jpg 第一财经 \n", + "\n", + " tags title topics typeo \\\n", + "0 北斗星通;游资;龙虎;买入;通信 机构抄底超讯通信 游资封板北斗星通丨龙虎榜 10 \n", + "1 外资;A股;基金;QFII;RQFII;瑞银 QFII投资A股走过20年,外资驶向何方? 10 \n", + "\n", + " url weight \n", + "0 /news/101788593.html 50 \n", + "1 /news/101788183.html 50 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorcreationDatedescsourcetitle
006-21 11:41第一财经机构抄底超讯通信 游资封板北斗星通丨龙虎榜
1[周艾琳]06-20 21:552003年7月,第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票,受到了国内...第一财经QFII投资A股走过20年,外资驶向何方?
206-20 11:46第一财经北向资金抄底贵州<i>茅台</i> 游资联手封板中远海科丨龙虎榜
306-20 11:45第一财经22股获北向资金加仓超亿元
406-20 11:36第一财经北向资金抄底贵州<i>茅台</i> 游资联手封板中远海科丨龙虎榜
506-20 06:23第一财经每日早间精选热点新闻,点击「听新闻」,一键收听。第一财经布林肯结束访华,外交部美大司司长介绍情况;2023高考网上咨询周时间安排公布丨早报
6[第一财经]06-19 19:06今日股市0619丨50大跌小票指数强势 分化局面会否延续?
706-19 19:05第一财经今日股市0619丨50大跌小票指数强势 分化局面会否延续?
8[一财资讯]06-19 17:46净买入额居前三的是贵州<i>茅台</i>、药明康德、新易盛,分别获净买入3.48亿元、3.3...第一财经北向资金净卖出14.47亿元,贵州<i>茅台</i>、药明康德等获加仓
906-19 15:39第一财经三大指数小幅收跌 TMT赛道持续大涨|尾市盘点
\n", + "
" + ], + "text/plain": [ + " author creationDate desc \\\n", + "0 06-21 11:41 \n", + "1 [周艾琳] 06-20 21:55 2003年7月,第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票,受到了国内... \n", + "2 06-20 11:46 \n", + "3 06-20 11:45 \n", + "4 06-20 11:36 \n", + "5 06-20 06:23 第一财经每日早间精选热点新闻,点击「听新闻」,一键收听。 \n", + "6 [第一财经] 06-19 19:06 \n", + "7 06-19 19:05 \n", + "8 [一财资讯] 06-19 17:46 净买入额居前三的是贵州茅台、药明康德、新易盛,分别获净买入3.48亿元、3.3... \n", + "9 06-19 15:39 \n", + "\n", + " source title \n", + "0 第一财经 机构抄底超讯通信 游资封板北斗星通丨龙虎榜 \n", + "1 第一财经 QFII投资A股走过20年,外资驶向何方? \n", + "2 第一财经 北向资金抄底贵州茅台 游资联手封板中远海科丨龙虎榜 \n", + "3 第一财经 22股获北向资金加仓超亿元 \n", + "4 第一财经 北向资金抄底贵州茅台 游资联手封板中远海科丨龙虎榜 \n", + "5 第一财经 布林肯结束访华,外交部美大司司长介绍情况;2023高考网上咨询周时间安排公布丨早报 \n", + "6 今日股市0619丨50大跌小票指数强势 分化局面会否延续? \n", + "7 第一财经 今日股市0619丨50大跌小票指数强势 分化局面会否延续? \n", + "8 第一财经 北向资金净卖出14.47亿元,贵州茅台、药明康德等获加仓 \n", + "9 第一财经 三大指数小幅收跌 TMT赛道持续大涨|尾市盘点 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"author\", \"creationDate\", \"desc\" ,\"source\", \"title\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Investor Place" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.investorplace_streaming import InvestorPlace_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = InvestorPlace_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletimeauthorsummary
0[Trillion-Dollar Tech: 3 Stocks Poised for Unp...Jun 19, 2023Faisal Humayun, InvestorPlace ContributorThese are the tech stocks to buy for multibagg...
1[Trillion-Dollar Tech: 3 Stocks Poised for Unp...Jun 22, 2023Chris MacDonald, InvestorPlace ContributorWarren Buffett is undoubtedly one of the great...
2[Invest Like a Billionaire: 3 Long-Term Stocks...Jun 18, 2023Joel Baglole, InvestorPlace ContributorWith markets now recovering from the downturn ...
3[Invest Like a Billionaire: 3 Long-Term Stocks...Jun 16, 2023Louis Navellier and the InvestorPlace Research...The best tech stocks to watch are involved in ...
4[3 Tech Titans Leading the Charge Toward $10 T...Jun 19, 2023Will Ashworth, InvestorPlace ContributorAvoiding bad stocks requires investors to get ...
5[3 Tech Titans Leading the Charge Toward $10 T...Jun 19, 2023Tyrik Torres, InvestorPlace ContributorWhile AI software companies tend to get more b...
6[7 Tech Stocks to Watch Out For in 2023 … and ...Jun 16, 2023Chris MacDonald, InvestorPlace ContributorMany long-term conservative investors pay atte...
7[7 Tech Stocks to Watch Out For in 2023 … and ...Jun 16, 2023Louis Navellier and the InvestorPlace Research...Every stock has its ups and downs, but reliabl...
8[3 Smart Takes on 3 Dumb Stocks]Jun 23, 2023Samuel O'Brient, InvestorPlace Financial News ...Even as tech stocks rally, short sellers are s...
9[3 Smart Takes on 3 Dumb Stocks]Jun 18, 2023Chris Markoch, InvestorPlace ContributorHere are seven high cash flow stocks that prov...
\n", + "
" + ], + "text/plain": [ + " title time \\\n", + "0 [Trillion-Dollar Tech: 3 Stocks Poised for Unp... Jun 19, 2023 \n", + "1 [Trillion-Dollar Tech: 3 Stocks Poised for Unp... Jun 22, 2023 \n", + "2 [Invest Like a Billionaire: 3 Long-Term Stocks... Jun 18, 2023 \n", + "3 [Invest Like a Billionaire: 3 Long-Term Stocks... Jun 16, 2023 \n", + "4 [3 Tech Titans Leading the Charge Toward $10 T... Jun 19, 2023 \n", + "5 [3 Tech Titans Leading the Charge Toward $10 T... Jun 19, 2023 \n", + "6 [7 Tech Stocks to Watch Out For in 2023 … and ... Jun 16, 2023 \n", + "7 [7 Tech Stocks to Watch Out For in 2023 … and ... Jun 16, 2023 \n", + "8 [3 Smart Takes on 3 Dumb Stocks] Jun 23, 2023 \n", + "9 [3 Smart Takes on 3 Dumb Stocks] Jun 18, 2023 \n", + "\n", + " author \\\n", + "0 Faisal Humayun, InvestorPlace Contributor \n", + "1 Chris MacDonald, InvestorPlace Contributor \n", + "2 Joel Baglole, InvestorPlace Contributor \n", + "3 Louis Navellier and the InvestorPlace Research... \n", + "4 Will Ashworth, InvestorPlace Contributor \n", + "5 Tyrik Torres, InvestorPlace Contributor \n", + "6 Chris MacDonald, InvestorPlace Contributor \n", + "7 Louis Navellier and the InvestorPlace Research... \n", + "8 Samuel O'Brient, InvestorPlace Financial News ... \n", + "9 Chris Markoch, InvestorPlace Contributor \n", + "\n", + " summary \n", + "0 These are the tech stocks to buy for multibagg... \n", + "1 Warren Buffett is undoubtedly one of the great... \n", + "2 With markets now recovering from the downturn ... \n", + "3 The best tech stocks to watch are involved in ... \n", + "4 Avoiding bad stocks requires investors to get ... \n", + "5 While AI software companies tend to get more b... \n", + "6 Many long-term conservative investors pay atte... \n", + "7 Every stock has its ups and downs, but reliabl... \n", + "8 Even as tech stocks rally, short sellers are s... \n", + "9 Here are seven high cash flow stocks that prov... 
" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"time\" ,\"author\", \"summary\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Guru Focus" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.gurufocus_streaming import GuruFocus_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support first page now!\n" + ] + } + ], + "source": [ + "news_downloader = GuruFocus_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"AAPL\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleviewsourcedatetime
03 Magic Formula Stocks Popular With Gurus0 ViewsMargaret Moran2023-06-23 17:38
1Jeremy Grantham: The Super Bubble Is About to Pop60 ViewsBen Alaimo2023-06-23 09:21
25 High GF Score Stocks That Outperformed the M...106 ViewsJames Li2023-06-21 19:36
3New Feature: DuPont Analysis Chart for Enhance...259 ViewsVera Yuan2023-06-21 16:55
4The Most-Sold Guru Stocks of the 1st Quarter261 ViewsMargaret Moran2023-06-16 17:32
5AI Revolution and Debt Ceiling Resolution198 ViewsWade W. Slome, CFA, CFP2023-06-05 21:03
6Nvidia vs. ARK Invest: Which Is the Better Gro...332 ViewsJoey Frenette2023-05-27 02:05
7Top 5 1st Quarter Trades of CYPRESS ASSET MANA...0 ViewsGuruFocus Editor2023-05-26 14:08
8Mill Creek Capital Advisors, LLC Buys 2, Sells...0 ViewsGuruFocus Editor2023-05-25 18:10
9Jim Simons' Renaissance Technologies Chops Pos...380 ViewsJames Li2023-05-24 18:43
\n", + "
" + ], + "text/plain": [ + " title view \\\n", + "0 3 Magic Formula Stocks Popular With Gurus 0 Views \n", + "1 Jeremy Grantham: The Super Bubble Is About to Pop 60 Views \n", + "2 5 High GF Score Stocks That Outperformed the M... 106 Views \n", + "3 New Feature: DuPont Analysis Chart for Enhance... 259 Views \n", + "4 The Most-Sold Guru Stocks of the 1st Quarter 261 Views \n", + "5 AI Revolution and Debt Ceiling Resolution 198 Views \n", + "6 Nvidia vs. ARK Invest: Which Is the Better Gro... 332 Views \n", + "7 Top 5 1st Quarter Trades of CYPRESS ASSET MANA... 0 Views \n", + "8 Mill Creek Capital Advisors, LLC Buys 2, Sells... 0 Views \n", + "9 Jim Simons' Renaissance Technologies Chops Pos... 380 Views \n", + "\n", + " source datetime \n", + "0 Margaret Moran 2023-06-23 17:38 \n", + "1 Ben Alaimo 2023-06-23 09:21 \n", + "2 James Li 2023-06-21 19:36 \n", + "3 Vera Yuan 2023-06-21 16:55 \n", + "4 Margaret Moran 2023-06-16 17:32 \n", + "5 Wade W. Slome, CFA, CFP 2023-06-05 21:03 \n", + "6 Joey Frenette 2023-05-27 02:05 \n", + "7 GuruFocus Editor 2023-05-26 14:08 \n", + "8 GuruFocus Editor 2023-05-25 18:10 \n", + "9 James Li 2023-05-24 18:43 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"view\" ,\"source\", \"datetime\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Alliance News" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.alliancenews_streaming import AllianceNews_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "news_downloader = AllianceNews_Streaming()\n", + "news_downloader.download_streaming_search(rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(36, 16)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
urlIdtitlesummarycreatedupdatedthumbnailUrlsourcetaxonomiestypeauthormetasponsorparentcontentIddisplayTaxonomiesparentTaxonomy
0/news/new-york-market-close-stocks-down-dollar...NEW YORK MARKET CLOSE: Stocks down, dollar up ...None2023-06-23T21:18:342023-06-23T21:18:34None{'code': 'ALLIANCE', 'title': 'Alliance News',...[{'termId': 'CTMRR', 'parentTermId': None, 'ti...newsNone{'title': 'NEW YORK MARKET CLOSE: Stocks down,...NoneNoneal1687551514259519100[{'termId': 'TPCOM', 'parentTermId': 'PTMKT', ...None
\n", + "
" + ], + "text/plain": [ + " urlId \\\n", + "0 /news/new-york-market-close-stocks-down-dollar... \n", + "\n", + " title summary \\\n", + "0 NEW YORK MARKET CLOSE: Stocks down, dollar up ... None \n", + "\n", + " created updated thumbnailUrl \\\n", + "0 2023-06-23T21:18:34 2023-06-23T21:18:34 None \n", + "\n", + " source \\\n", + "0 {'code': 'ALLIANCE', 'title': 'Alliance News',... \n", + "\n", + " taxonomies type author \\\n", + "0 [{'termId': 'CTMRR', 'parentTermId': None, 'ti... news None \n", + "\n", + " meta sponsor parent \\\n", + "0 {'title': 'NEW YORK MARKET CLOSE: Stocks down,... None None \n", + "\n", + " contentId displayTaxonomies \\\n", + "0 al1687551514259519100 [{'termId': 'TPCOM', 'parentTermId': 'PTMKT', ... \n", + "\n", + " parentTaxonomy \n", + "0 None " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
createdupdatedtitlesummarymeta
02023-06-23T21:18:342023-06-23T21:18:34NEW YORK MARKET CLOSE: Stocks down, dollar up ...None{'title': 'NEW YORK MARKET CLOSE: Stocks down,...
12023-06-23T19:34:052023-06-23T19:34:05IN BRIEF: Blackstone Loan Financing proposes w...None{'title': 'IN BRIEF: Blackstone Loan Financing...
22023-06-23T18:34:412023-06-23T18:34:41IN BRIEF: Bonhill expects to complete sale of ...None{'title': 'IN BRIEF: Bonhill expects to comple...
32023-06-23T18:01:272023-06-23T18:01:27UPDATE: SRT Marine Systems raises GBP4.6 milli...None{'title': 'UPDATE: SRT Marine Systems raises G...
42023-06-23T18:00:272023-06-23T18:00:27IN BRIEF: New Energy One Acquisition confirms ...None{'title': 'IN BRIEF: New Energy One Acquisitio...
52023-06-23T17:41:152023-06-23T17:41:15IN BRIEF: Kropz makes draw down request on bri...None{'title': 'IN BRIEF: Kropz makes draw down req...
62023-06-23T17:31:172023-06-23T17:31:17IN BRIEF: XPS Pensions discusses National Pens...None{'title': 'IN BRIEF: XPS Pensions discusses Na...
72023-06-23T17:25:542023-06-23T17:25:54DIRECTOR DEALINGS: GSK CFO buys shares worth G...None{'title': 'DIRECTOR DEALINGS: GSK CFO buys sha...
82023-06-23T17:21:292023-06-23T17:21:29IN BRIEF: Gilead Sciences says test results sh...None{'title': 'IN BRIEF: Gilead Sciences says test...
92023-06-23T17:07:242023-06-23T17:07:24IN THE KNOW: AB Foods \"fundamentally strong\" w...None{'title': 'IN THE KNOW: AB Foods \"fundamentall...
\n", + "
" + ], + "text/plain": [ + " created updated \\\n", + "0 2023-06-23T21:18:34 2023-06-23T21:18:34 \n", + "1 2023-06-23T19:34:05 2023-06-23T19:34:05 \n", + "2 2023-06-23T18:34:41 2023-06-23T18:34:41 \n", + "3 2023-06-23T18:01:27 2023-06-23T18:01:27 \n", + "4 2023-06-23T18:00:27 2023-06-23T18:00:27 \n", + "5 2023-06-23T17:41:15 2023-06-23T17:41:15 \n", + "6 2023-06-23T17:31:17 2023-06-23T17:31:17 \n", + "7 2023-06-23T17:25:54 2023-06-23T17:25:54 \n", + "8 2023-06-23T17:21:29 2023-06-23T17:21:29 \n", + "9 2023-06-23T17:07:24 2023-06-23T17:07:24 \n", + "\n", + " title summary \\\n", + "0 NEW YORK MARKET CLOSE: Stocks down, dollar up ... None \n", + "1 IN BRIEF: Blackstone Loan Financing proposes w... None \n", + "2 IN BRIEF: Bonhill expects to complete sale of ... None \n", + "3 UPDATE: SRT Marine Systems raises GBP4.6 milli... None \n", + "4 IN BRIEF: New Energy One Acquisition confirms ... None \n", + "5 IN BRIEF: Kropz makes draw down request on bri... None \n", + "6 IN BRIEF: XPS Pensions discusses National Pens... None \n", + "7 DIRECTOR DEALINGS: GSK CFO buys shares worth G... None \n", + "8 IN BRIEF: Gilead Sciences says test results sh... None \n", + "9 IN THE KNOW: AB Foods \"fundamentally strong\" w... None \n", + "\n", + " meta \n", + "0 {'title': 'NEW YORK MARKET CLOSE: Stocks down,... \n", + "1 {'title': 'IN BRIEF: Blackstone Loan Financing... \n", + "2 {'title': 'IN BRIEF: Bonhill expects to comple... \n", + "3 {'title': 'UPDATE: SRT Marine Systems raises G... \n", + "4 {'title': 'IN BRIEF: New Energy One Acquisitio... \n", + "5 {'title': 'IN BRIEF: Kropz makes draw down req... \n", + "6 {'title': 'IN BRIEF: XPS Pensions discusses Na... \n", + "7 {'title': 'DIRECTOR DEALINGS: GSK CFO buys sha... \n", + "8 {'title': 'IN BRIEF: Gilead Sciences says test... \n", + "9 {'title': 'IN THE KNOW: AB Foods \"fundamentall... 
" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"created\", \"updated\", \"title\", \"summary\", \"meta\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Talk Market" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.talkmarkets_streaming import TalkMarkets_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading... 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = TalkMarkets_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60, 12)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cacheUrlclicktrackUrlcontentcontentNoFormattingtitletitleNoFormattingformattedUrlunescapedUrlurlvisibleUrlrichSnippetbreadcrumbUrl
0http://www.google.com/search?q=cache:PUjyIRJA8...https://www.google.com/url?client=internal-ele...23 hours ago <b>...</b> <b>Apple</b>, 187, 3.0...23 hours ago ... Apple, 187, 3.04, 1.65%, 187....Equitymaster India | Sensex Today Trades Lower...Equitymaster India | Sensex Today Trades Lower...https://talkmarkets.com/.../sensex-today-trade...https://talkmarkets.com/content/global-markets...https://talkmarkets.com/content/global-markets...talkmarkets.com{'cseImage': {'src': 'https://www.eqimg.com/im...{'host': 'talkmarkets.com', 'crumbs': ['sensex...
\n", + "
" + ], + "text/plain": [ + " cacheUrl \\\n", + "0 http://www.google.com/search?q=cache:PUjyIRJA8... \n", + "\n", + " clicktrackUrl \\\n", + "0 https://www.google.com/url?client=internal-ele... \n", + "\n", + " content \\\n", + "0 23 hours ago ... Apple, 187, 3.0... \n", + "\n", + " contentNoFormatting \\\n", + "0 23 hours ago ... Apple, 187, 3.04, 1.65%, 187.... \n", + "\n", + " title \\\n", + "0 Equitymaster India | Sensex Today Trades Lower... \n", + "\n", + " titleNoFormatting \\\n", + "0 Equitymaster India | Sensex Today Trades Lower... \n", + "\n", + " formattedUrl \\\n", + "0 https://talkmarkets.com/.../sensex-today-trade... \n", + "\n", + " unescapedUrl \\\n", + "0 https://talkmarkets.com/content/global-markets... \n", + "\n", + " url visibleUrl \\\n", + "0 https://talkmarkets.com/content/global-markets... talkmarkets.com \n", + "\n", + " richSnippet \\\n", + "0 {'cseImage': {'src': 'https://www.eqimg.com/im... \n", + "\n", + " breadcrumbUrl \n", + "0 {'host': 'talkmarkets.com', 'crumbs': ['sensex... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contenturlclicktrackUrl
023 hours ago <b>...</b> <b>Apple</b>, 187, 3.0...https://talkmarkets.com/content/global-markets...https://www.google.com/url?client=internal-ele...
11 day ago <b>...</b> Get Adobe Inc. (ADBE:NASD...https://talkmarkets.com/symbol/adbe/portal-wid...https://www.google.com/url?client=internal-ele...
21 day ago <b>...</b> Get Starbucks Corp (SBUX:...https://talkmarkets.com/symbol/sbux/portal-wid...https://www.google.com/url?client=internal-ele...
310 hours ago <b>...</b> Wednesday&#39;s top an...https://talkmarkets.com/symbol/pypl/portal-wid...https://www.google.com/url?client=internal-ele...
420 hours ago <b>...</b> <b>Apple</b> (AAPL). <...https://talkmarkets.com/content/stocks--equiti...https://www.google.com/url?client=internal-ele...
52 days ago <b>...</b> Friday&#39;s top analyst...https://talkmarkets.com/symbol/sofi/portal-wid...https://www.google.com/url?client=internal-ele...
62 days ago <b>...</b> Get Enphase Energy Inc (...https://talkmarkets.com/symbol/enph/portal-wid...https://www.google.com/url?client=internal-ele...
73 days ago <b>...</b> <b>Apple</b> Inc. design...https://talkmarkets.com/contributor/jimvanmeer...https://www.google.com/url?client=internal-ele...
83 days ago <b>...</b> <b>Apple</b> Inc. design...https://talkmarkets.com/content/stocks--equiti...https://www.google.com/url?client=internal-ele...
95 Jan 2023 <b>...</b> Get Amazon.com Inc (AMZN...https://talkmarkets.com/symbol/amzn/portal-wid...https://www.google.com/url?client=internal-ele...
\n", + "
" + ], + "text/plain": [ + " content \\\n", + "0 23 hours ago ... Apple, 187, 3.0... \n", + "1 1 day ago ... Get Adobe Inc. (ADBE:NASD... \n", + "2 1 day ago ... Get Starbucks Corp (SBUX:... \n", + "3 10 hours ago ... Wednesday's top an... \n", + "4 20 hours ago ... Apple (AAPL). <... \n", + "5 2 days ago ... Friday's top analyst... \n", + "6 2 days ago ... Get Enphase Energy Inc (... \n", + "7 3 days ago ... Apple Inc. design... \n", + "8 3 days ago ... Apple Inc. design... \n", + "9 5 Jan 2023 ... Get Amazon.com Inc (AMZN... \n", + "\n", + " url \\\n", + "0 https://talkmarkets.com/content/global-markets... \n", + "1 https://talkmarkets.com/symbol/adbe/portal-wid... \n", + "2 https://talkmarkets.com/symbol/sbux/portal-wid... \n", + "3 https://talkmarkets.com/symbol/pypl/portal-wid... \n", + "4 https://talkmarkets.com/content/stocks--equiti... \n", + "5 https://talkmarkets.com/symbol/sofi/portal-wid... \n", + "6 https://talkmarkets.com/symbol/enph/portal-wid... \n", + "7 https://talkmarkets.com/contributor/jimvanmeer... \n", + "8 https://talkmarkets.com/content/stocks--equiti... \n", + "9 https://talkmarkets.com/symbol/amzn/portal-wid... \n", + "\n", + " clicktrackUrl \n", + "0 https://www.google.com/url?client=internal-ele... \n", + "1 https://www.google.com/url?client=internal-ele... \n", + "2 https://www.google.com/url?client=internal-ele... \n", + "3 https://www.google.com/url?client=internal-ele... \n", + "4 https://www.google.com/url?client=internal-ele... \n", + "5 https://www.google.com/url?client=internal-ele... \n", + "6 https://www.google.com/url?client=internal-ele... \n", + "7 https://www.google.com/url?client=internal-ele... \n", + "8 https://www.google.com/url?client=internal-ele... \n", + "9 https://www.google.com/url?client=internal-ele... 
" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"content\", \"url\", \"clicktrackUrl\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The Fly" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.thefly_streaming import TheFly_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\ProgramData\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'thefly.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support the first page now!\n" + ] + } + ], + "source": [ + "news_downloader = TheFly_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"AAPL\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlestockabstractdatetime
0Apple in talks to launch Apple Card in India, ...AAPLApple is in talks to…06/23/2305:37
1Apple says visionOS software development kit n...AAPLApple announced the…06/21/2316:03
2Apple to create spatial experiences for Apple ...AAPLApple \"announced the…06/21/2316:00
3Notable open interest changes for June 21stTSLA NVDA AAPL AMZNTuesday's total…06/21/2308:55
4What You Missed This Week in Video GamesTCEHY TTWO EA CCOEY UBSFY CMCSK CMCSA RBLX AAP...\"Game On\" is The Fly's…06/20/2312:11
5Notable open interest changes for June 20thTSLA AMC AAPL NVDAFriday's total…06/20/2308:55
6Apple call buyer realizes 20% same-day gainsAAPLNotable profits for the…06/16/2308:00
7Notable open interest changes for June 15thTSLA NVDA SOFI AAPLWednesday's total…06/15/2308:55
8US senators propose bill to eliminate Section ...GOOG MSFT AMZN AAPL NVDA IBM META INTC\"U.S. Senators Josh…06/14/2317:54
9#SocialStocks: Twitter skips out on rent and G...TWTR AAPL META GOOGL GOOG ZM RBLX PINSWelcome to…06/14/2315:57
\n", + "
" + ], + "text/plain": [ + " title ... time\n", + "0 Apple in talks to launch Apple Card in India, ... ... 05:37\n", + "1 Apple says visionOS software development kit n... ... 16:03\n", + "2 Apple to create spatial experiences for Apple ... ... 16:00\n", + "3 Notable open interest changes for June 21st ... 08:55\n", + "4 What You Missed This Week in Video Games ... 12:11\n", + "5 Notable open interest changes for June 20th ... 08:55\n", + "6 Apple call buyer realizes 20% same-day gains ... 08:00\n", + "7 Notable open interest changes for June 15th ... 08:55\n", + "8 US senators propose bill to eliminate Section ... ... 17:54\n", + "9 #SocialStocks: Twitter skips out on rent and G... ... 15:57\n", + "\n", + "[10 rows x 5 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"stock\", \"abstract\", \"date\", \"time\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tip Rank" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.tipranks_streaming import TipRanks_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading: 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = TipRanks_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stocks_idauthorcategorydatedescriptionimageisLockedlinklockTypeslugstickythumbnailtitletopicstimeAgobadgeid
0[{'ticker': 'AAPL', 'market': None}]582530{'slug': 'steveanderson'}{'slug': 'news', 'title': 'Market News'}2023-06-22T19:52:41.000Z<p>Those who regularly follow Apple stock (NAS...{'src': 'https://blog.tipranks.com/wp-content/...Truehttps://www.tipranks.com/news/aapl-notches-up-...GraceCountaapl-notches-up-following-barclays-commentsFalse{'src': 'https://blog.tipranks.com/wp-content/...AAPL Notches Up Following Barclays Comments[{'id': 0, 'type': 'stock', 'title': 'AAPL', '...13hNone582530
1[{'ticker': 'AAPL', 'market': None}, {'ticker'...579043{'slug': 'amit-singh'}{'slug': 'article', 'title': 'Stock Analysis &...2023-06-19T15:30:38.000Z<p>Affirm Holdings (NASDAQ:AFRM) stock recover...{'src': 'https://blog.tipranks.com/wp-content/...Truehttps://www.tipranks.com/news/article/affirm-s...GraceCountaffirm-stock-has-risen-swiftly-will-it-beat-ap...False{'src': 'https://blog.tipranks.com/wp-content/...Affirm Stock Has Risen Swiftly. Will It Beat A...[{'id': 0, 'type': 'stock', 'title': 'AAPL', '...4dNone579043
\n", + "
" + ], + "text/plain": [ + " stocks _id \\\n", + "0 [{'ticker': 'AAPL', 'market': None}] 582530 \n", + "1 [{'ticker': 'AAPL', 'market': None}, {'ticker'... 579043 \n", + "\n", + " author \\\n", + "0 {'slug': 'steveanderson'} \n", + "1 {'slug': 'amit-singh'} \n", + "\n", + " category \\\n", + "0 {'slug': 'news', 'title': 'Market News'} \n", + "1 {'slug': 'article', 'title': 'Stock Analysis &... \n", + "\n", + " date \\\n", + "0 2023-06-22T19:52:41.000Z \n", + "1 2023-06-19T15:30:38.000Z \n", + "\n", + " description \\\n", + "0

Those who regularly follow Apple stock (NAS... \n", + "1

Affirm Holdings (NASDAQ:AFRM) stock recover... \n", + "\n", + " image isLocked \\\n", + "0 {'src': 'https://blog.tipranks.com/wp-content/... True \n", + "1 {'src': 'https://blog.tipranks.com/wp-content/... True \n", + "\n", + " link lockType \\\n", + "0 https://www.tipranks.com/news/aapl-notches-up-... GraceCount \n", + "1 https://www.tipranks.com/news/article/affirm-s... GraceCount \n", + "\n", + " slug sticky \\\n", + "0 aapl-notches-up-following-barclays-comments False \n", + "1 affirm-stock-has-risen-swiftly-will-it-beat-ap... False \n", + "\n", + " thumbnail \\\n", + "0 {'src': 'https://blog.tipranks.com/wp-content/... \n", + "1 {'src': 'https://blog.tipranks.com/wp-content/... \n", + "\n", + " title \\\n", + "0 AAPL Notches Up Following Barclays Comments \n", + "1 Affirm Stock Has Risen Swiftly. Will It Beat A... \n", + "\n", + " topics timeAgo badge id \n", + "0 [{'id': 0, 'type': 'stock', 'title': 'AAPL', '... 13h None 582530 \n", + "1 [{'id': 0, 'type': 'stock', 'title': 'AAPL', '... 4d None 579043 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stocksdateauthortitledescription
0[{'ticker': 'AAPL', 'market': None}]2023-06-22T19:52:41.000Z{'slug': 'steveanderson'}AAPL Notches Up Following Barclays Comments<p>Those who regularly follow Apple stock (NAS...
1[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-19T15:30:38.000Z{'slug': 'amit-singh'}Affirm Stock Has Risen Swiftly. Will It Beat A...<p>Affirm Holdings (NASDAQ:AFRM) stock recover...
2[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-16T20:51:11.000Z{'slug': 'joey-frenette'}Apple Stock (NASDAQ:AAPL): Expectations Too Mo...<p>Apple (NASDAQ:AAPL) stock recently hit a ne...
3[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-16T08:32:40.000Z{'slug': 'sheryl-sheth'}Lost the Nvidia and Apple Boom? Microsoft (NAS...<p>Think you lost the chance to become wealthy...
4[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-15T19:20:48.000Z{'slug': 'joey-frenette'}Unity Software (NASDAQ:U): Apple Vision Pro Pa...<p>Unity Software (NASDAQ:U) rallied 17% when ...
5[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-15T00:53:16.000Z{'slug': 'joey-frenette'}Investing in Apple’s (NASDAQ:AAPL) Ecosystem: ...<p>Apple (NASDAQ:AAPL) has been on an unbeliev...
6[{'ticker': 'AAPL', 'market': None}]2023-06-13T15:13:40.000Z{'slug': 'vince-condarcuri'}AAPL Stock Slips after Analyst Downgrade<p>Despite outperforming the S&amp;P 500 with ...
7[{'ticker': 'AAPL', 'market': None}]2023-06-10T15:09:23.000Z{'slug': 'martyshtrubel'}Apple Stock Gets a New Street-High Price Target<p>Apple’s (NASDAQ:AAPL) virtual reality and a...
8[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-09T09:46:28.000Z{'slug': 'amit-singh'}NVDA to META: Insiders Capitalise on Tech Stoc...<p>Technology stocks rebounded strongly in 202...
9[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-08T18:20:20.000Z{'slug': 'michaelbyrne'}Apple Stock is on Fire. Invest in it with Thes...<p>Apple (NASDAQ:AAPL) stock is off to a gain ...
\n", + "
" + ], + "text/plain": [ + " stocks \\\n", + "0 [{'ticker': 'AAPL', 'market': None}] \n", + "1 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "2 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "3 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "4 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "5 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "6 [{'ticker': 'AAPL', 'market': None}] \n", + "7 [{'ticker': 'AAPL', 'market': None}] \n", + "8 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "9 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "\n", + " date author \\\n", + "0 2023-06-22T19:52:41.000Z {'slug': 'steveanderson'} \n", + "1 2023-06-19T15:30:38.000Z {'slug': 'amit-singh'} \n", + "2 2023-06-16T20:51:11.000Z {'slug': 'joey-frenette'} \n", + "3 2023-06-16T08:32:40.000Z {'slug': 'sheryl-sheth'} \n", + "4 2023-06-15T19:20:48.000Z {'slug': 'joey-frenette'} \n", + "5 2023-06-15T00:53:16.000Z {'slug': 'joey-frenette'} \n", + "6 2023-06-13T15:13:40.000Z {'slug': 'vince-condarcuri'} \n", + "7 2023-06-10T15:09:23.000Z {'slug': 'martyshtrubel'} \n", + "8 2023-06-09T09:46:28.000Z {'slug': 'amit-singh'} \n", + "9 2023-06-08T18:20:20.000Z {'slug': 'michaelbyrne'} \n", + "\n", + " title \\\n", + "0 AAPL Notches Up Following Barclays Comments \n", + "1 Affirm Stock Has Risen Swiftly. Will It Beat A... \n", + "2 Apple Stock (NASDAQ:AAPL): Expectations Too Mo... \n", + "3 Lost the Nvidia and Apple Boom? Microsoft (NAS... \n", + "4 Unity Software (NASDAQ:U): Apple Vision Pro Pa... \n", + "5 Investing in Apple’s (NASDAQ:AAPL) Ecosystem: ... \n", + "6 AAPL Stock Slips after Analyst Downgrade \n", + "7 Apple Stock Gets a New Street-High Price Target \n", + "8 NVDA to META: Insiders Capitalise on Tech Stoc... \n", + "9 Apple Stock is on Fire. Invest in it with Thes... \n", + "\n", + " description \n", + "0

Those who regularly follow Apple stock (NAS... \n", + "1

Affirm Holdings (NASDAQ:AFRM) stock recover... \n", + "2

Apple (NASDAQ:AAPL) stock recently hit a ne... \n", + "3

Think you lost the chance to become wealthy... \n", + "4

Unity Software (NASDAQ:U) rallied 17% when ... \n", + "5

Apple (NASDAQ:AAPL) has been on an unbeliev... \n", + "6

Despite outperforming the S&P 500 with ... \n", + "7

Apple’s (NASDAQ:AAPL) virtual reality and a... \n", + "8

Technology stocks rebounded strongly in 202... \n", + "9

Apple (NASDAQ:AAPL) stock is off to a gain ... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"stocks\", \"date\", \"author\", \"title\", \"description\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Market Watch (Date Range)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.marketwatch_date_range import MarketWatch_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2022-06-01\"\n", + "end_date = \"2022-06-30\"\n", + "keyword = \"apple\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support the first page now!\n" + ] + } + ], + "source": [ + "news_downloader = MarketWatch_Date_Range()\n", + "news_downloader.download_date_range_search(keyword = \"apple\", start_date = start_date, end_date = end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletimeauthor
0Gold falls more than 2% for the month, settles...Jun. 30, 2022 at 2:47 p.m. ETby Joseph Adinolfi
1AMD stock gets an upgrade as analyst says rece...Jun. 30, 2022 at 12:07 p.m. ETby Emily Bary
2All 30 Dow stocks are falling, with Goldman Sa...Jun. 30, 2022 at 9:47 a.m. ETby Tomi Kilgore
3Here’s how far oil could fall in a recession, ...Jun. 30, 2022 at 8:34 a.m. ETby Steve Goldstein
4Crypto Winter Is Coming After SEC Rejects Key ...Jun. 30, 2022 at 6:37 a.m. ET
5An FCC Commissioner Wants TikTok Yanked From A...Jun. 30, 2022 at 3:27 a.m. ETby Barron's
6Meta Has a New Problem. Profit Forecasts Now L...Jun. 29, 2022 at 1:18 p.m. ETby Barron's
7Fed rolls out new index to flag early warning ...Jun. 29, 2022 at 1:04 p.m. ETby Joy Wiltermuth
8Apple Investors Have Something New to Focus On...Jun. 29, 2022 at 12:41 p.m. ETby Barron's
9Here’s why this trader is piling back into one...Jun. 29, 2022 at 10:34 a.m. ETby Barbara Kollmeyer
\n", + "
" + ], + "text/plain": [ + " title \\\n", + "0 Gold falls more than 2% for the month, settles... \n", + "1 AMD stock gets an upgrade as analyst says rece... \n", + "2 All 30 Dow stocks are falling, with Goldman Sa... \n", + "3 Here’s how far oil could fall in a recession, ... \n", + "4 Crypto Winter Is Coming After SEC Rejects Key ... \n", + "5 An FCC Commissioner Wants TikTok Yanked From A... \n", + "6 Meta Has a New Problem. Profit Forecasts Now L... \n", + "7 Fed rolls out new index to flag early warning ... \n", + "8 Apple Investors Have Something New to Focus On... \n", + "9 Here’s why this trader is piling back into one... \n", + "\n", + " time author \n", + "0 Jun. 30, 2022 at 2:47 p.m. ET by Joseph Adinolfi \n", + "1 Jun. 30, 2022 at 12:07 p.m. ET by Emily Bary \n", + "2 Jun. 30, 2022 at 9:47 a.m. ET by Tomi Kilgore \n", + "3 Jun. 30, 2022 at 8:34 a.m. ET by Steve Goldstein \n", + "4 Jun. 30, 2022 at 6:37 a.m. ET \n", + "5 Jun. 30, 2022 at 3:27 a.m. ET by Barron's \n", + "6 Jun. 29, 2022 at 1:18 p.m. ET by Barron's \n", + "7 Jun. 29, 2022 at 1:04 p.m. ET by Joy Wiltermuth \n", + "8 Jun. 29, 2022 at 12:41 p.m. ET by Barron's \n", + "9 Jun. 29, 2022 at 10:34 a.m. 
ET by Barbara Kollmeyer " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"time\", \"author\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Market Watch (Streaming)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.marketwatch_streaming import MarketWatch_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support the first page now!\n" + ] + } + ], + "source": [ + "news_downloader = MarketWatch_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletimeauthor
0Tech IPOs Should Be Heating Up. Why They’re Not.Jun. 23, 2023 at 2:51 a.m. ETby Barron's
1Everything Is Going Right for Tesla. It’s Time...Jun. 23, 2023 at 1:30 a.m. ETby Barron's
2India’s Modi cracks jokes, chows down at swank...Jun. 22, 2023 at 11:38 p.m. ETby Associated Press
3Work-From-Home Job Openings Are ShrinkingJun. 22, 2023 at 6:23 p.m. ETby Barron's
4Congress Blasts E-Commerce Firm Temu Over Forc...Jun. 22, 2023 at 5:44 p.m. ETby Barron's
5Meta Platforms Inc. stock outperforms market o...Jun. 22, 2023 at 5:32 p.m. ETby MarketWatch Automation
6Microsoft Corp. stock outperforms competitors ...Jun. 22, 2023 at 5:32 p.m. ETby MarketWatch Automation
7Netflix Inc. stock underperforms Thursday when...Jun. 22, 2023 at 5:32 p.m. ETby MarketWatch Automation
8GameStop Corp. Cl A stock underperforms Thursd...Jun. 22, 2023 at 5:29 p.m. ETby MarketWatch Automation
9Amazon.com Inc. stock outperforms market on st...Jun. 22, 2023 at 5:25 p.m. ETby MarketWatch Automation
\n", + "
" + ], + "text/plain": [ + " title \\\n", + "0 Tech IPOs Should Be Heating Up. Why They’re Not. \n", + "1 Everything Is Going Right for Tesla. It’s Time... \n", + "2 India’s Modi cracks jokes, chows down at swank... \n", + "3 Work-From-Home Job Openings Are Shrinking \n", + "4 Congress Blasts E-Commerce Firm Temu Over Forc... \n", + "5 Meta Platforms Inc. stock outperforms market o... \n", + "6 Microsoft Corp. stock outperforms competitors ... \n", + "7 Netflix Inc. stock underperforms Thursday when... \n", + "8 GameStop Corp. Cl A stock underperforms Thursd... \n", + "9 Amazon.com Inc. stock outperforms market on st... \n", + "\n", + " time author \n", + "0 Jun. 23, 2023 at 2:51 a.m. ET by Barron's \n", + "1 Jun. 23, 2023 at 1:30 a.m. ET by Barron's \n", + "2 Jun. 22, 2023 at 11:38 p.m. ET by Associated Press \n", + "3 Jun. 22, 2023 at 6:23 p.m. ET by Barron's \n", + "4 Jun. 22, 2023 at 5:44 p.m. ET by Barron's \n", + "5 Jun. 22, 2023 at 5:32 p.m. ET by MarketWatch Automation \n", + "6 Jun. 22, 2023 at 5:32 p.m. ET by MarketWatch Automation \n", + "7 Jun. 22, 2023 at 5:32 p.m. ET by MarketWatch Automation \n", + "8 Jun. 22, 2023 at 5:29 p.m. ET by MarketWatch Automation \n", + "9 Jun. 22, 2023 at 5:25 p.m. ET by MarketWatch Automation " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"time\", \"author\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Penny Stock" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.pennystocks_streaming import PennyStocks_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requesting https://pennystocks.com ... 
succeed!\n", + "Gathering again .. Remaining Retry: 4\n", + "Only support the first page now!\n" + ] + } + ], + "source": [ + "news_downloader = PennyStocks_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletimebriefreading_time
0Best Penny Stocks to Buy Ahead Of Apple’s Even...September 14, 2021\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat do penny stock in...5 minute read
0What Could The Apple Event Mean For Penny Stoc...October 13, 2020\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWill The Apple Event M...5 minute read
03 Red Hot Penny Stocks To Watch Before Next We...June 14, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch.\\n4 minute read
0Penny Stocks Definition & 7 Trading Strategies...June 8, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat Is A Penny Stock?...6 minute read
0Best Penny Stocks To Buy? 5 With Big News This...June 7, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch ...4 minute read
0Penny Stocks & The Stock Market Today: Top Tre...May 30, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat happened in the s...6 minute read
0Penny Stocks To Buy? 3 AI Stocks To Watch Righ...May 30, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tAI penny stocks to wat...5 minute read
0What Are Penny Stocks & Should You Buy Them In...May 19, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tShould You Find Penny ...6 minute read
0Trading Penny Stocks: 3 High-Growth Industries...May 8, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWatch these three indu...7 minute read
0Fed Meeting Live Updates: 10 Takeaways From Ma...May 3, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tFOMC Statement From Ma...10 minute read
\n", + "
" + ], + "text/plain": [ + " title time \\\n", + "0 Best Penny Stocks to Buy Ahead Of Apple’s Even... September 14, 2021 \n", + "0 What Could The Apple Event Mean For Penny Stoc... October 13, 2020 \n", + "0 3 Red Hot Penny Stocks To Watch Before Next We... June 14, 2023 \n", + "0 Penny Stocks Definition & 7 Trading Strategies... June 8, 2023 \n", + "0 Best Penny Stocks To Buy? 5 With Big News This... June 7, 2023 \n", + "0 Penny Stocks & The Stock Market Today: Top Tre... May 30, 2023 \n", + "0 Penny Stocks To Buy? 3 AI Stocks To Watch Righ... May 30, 2023 \n", + "0 What Are Penny Stocks & Should You Buy Them In... May 19, 2023 \n", + "0 Trading Penny Stocks: 3 High-Growth Industries... May 8, 2023 \n", + "0 Fed Meeting Live Updates: 10 Takeaways From Ma... May 3, 2023 \n", + "\n", + " brief reading_time \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat do penny stock in... 5 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWill The Apple Event M... 5 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch.\\n 4 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat Is A Penny Stock?... 6 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch ... 4 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat happened in the s... 6 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tAI penny stocks to wat... 5 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tShould You Find Penny ... 6 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWatch these three indu... 7 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tFOMC Statement From Ma... 
10 minute read " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"time\", \"brief\", \"reading_time\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Seeking Alpha" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.seekingalpha_date_range import SeekingAlpha_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2023-06-01\"\n", + "end_date = \"2023-06-30\"\n", + "stock = \"AAPL\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading Titles: 100%|██████████| 1/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
publishOntitlecommentCount
02023-06-19T09:00:00-04:00Artificial intelligence is a '1995 moment' for...63
12023-06-16T11:59:46-04:00Citi: Don't worry about Big Tech fueling 2023'...17
22023-06-15T07:20:12-04:00Google said to temper chatbot use for employee...8
32023-06-14T05:17:31-04:00Nvidia crosses $1T market cap powered by the r...36
42023-06-13T17:17:00-04:00Intel in talks to be anchor investor in chip d...94
52023-06-13T12:14:45-04:00Hot Stocks: AAPL falls on downgrade; MANU rise...3
62023-06-13T06:22:59-04:00Apple notches record close as bulls continue t...18
72023-06-13T04:49:26-04:00Apple cut to Neutral at UBS on softer iPhone a...26
82023-06-09T05:42:17-04:00AI looking like a 'winner-take-more' game - Go...39
92023-06-09T05:21:04-04:00Zuckerberg's vision for AR/VR headsets differe...92
\n", + "" + ], + "text/plain": [ + " publishOn \\\n", + "0 2023-06-19T09:00:00-04:00 \n", + "1 2023-06-16T11:59:46-04:00 \n", + "2 2023-06-15T07:20:12-04:00 \n", + "3 2023-06-14T05:17:31-04:00 \n", + "4 2023-06-13T17:17:00-04:00 \n", + "5 2023-06-13T12:14:45-04:00 \n", + "6 2023-06-13T06:22:59-04:00 \n", + "7 2023-06-13T04:49:26-04:00 \n", + "8 2023-06-09T05:42:17-04:00 \n", + "9 2023-06-09T05:21:04-04:00 \n", + "\n", + " title commentCount \n", + "0 Artificial intelligence is a '1995 moment' for... 63 \n", + "1 Citi: Don't worry about Big Tech fueling 2023'... 17 \n", + "2 Google said to temper chatbot use for employee... 8 \n", + "3 Nvidia crosses $1T market cap powered by the r... 36 \n", + "4 Intel in talks to be anchor investor in chip d... 94 \n", + "5 Hot Stocks: AAPL falls on downgrade; MANU rise... 3 \n", + "6 Apple notches record close as bulls continue t... 18 \n", + "7 Apple cut to Neutral at UBS on softer iPhone a... 26 \n", + "8 AI looking like a 'winner-take-more' game - Go... 39 \n", + "9 Zuckerberg's vision for AR/VR headsets differe... 
92 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"publishOn\",\"title\",\"commentCount\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reuters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.reuters_streaming import Reuters_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Geting pages: 1 2 3 " + ] + } + ], + "source": [ + "news_downloader = Reuters_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
published_timetitledescription
02023-06-19T10:17:24.474ZHong Kong tycoon Jimmy Lai loses appeal agains...A Hong Kong appeal court on Monday blocked jai...
12023-06-19T09:49:09.391ZPodcast: Blinken meets Xi and Chinese bankers ...U.S. Secretary of State Anthony Blinken is in ...
22023-06-19T02:10:24.581ZTheir parents made China the world's factory. ...When Steven Du took over his parents' factory ...
32023-06-17T08:14:15.708ZJapan to open up Apple- and Google-dominated p...Japan plans to stoke competition in smartphone...
42023-06-16T13:28:02.538ZKhashoggi's widow sues Israeli spyware company...The widow of murdered Saudi journalist Jamal K...
52023-06-16T11:12:02.899ZFactbox: DLE companies racing to reshape globa...Lithium, the metal used to make electric vehic...
62023-06-16T10:17:23.831ZPodcast: US-Iran talks and Australia divided o...Australia is divided on a historic referendum ...
72023-06-16T02:16:44.523ZChinese e-commerce giants entice cautious cons...China's e-commerce platforms are competing fie...
82023-06-15T10:21:02.697ZS&P 500 leaps to highest close in 14 months; t...The S&P 500 and Nasdaq surged on Thursday to c...
92023-06-15T19:49:27.459ZMicrosoft notches record high valuation of nea...Microsoft Corp shares rose to a new record hig...
\n", + "
" + ], + "text/plain": [ + " published_time \\\n", + "0 2023-06-19T10:17:24.474Z \n", + "1 2023-06-19T09:49:09.391Z \n", + "2 2023-06-19T02:10:24.581Z \n", + "3 2023-06-17T08:14:15.708Z \n", + "4 2023-06-16T13:28:02.538Z \n", + "5 2023-06-16T11:12:02.899Z \n", + "6 2023-06-16T10:17:23.831Z \n", + "7 2023-06-16T02:16:44.523Z \n", + "8 2023-06-15T10:21:02.697Z \n", + "9 2023-06-15T19:49:27.459Z \n", + "\n", + " title \\\n", + "0 Hong Kong tycoon Jimmy Lai loses appeal agains... \n", + "1 Podcast: Blinken meets Xi and Chinese bankers ... \n", + "2 Their parents made China the world's factory. ... \n", + "3 Japan to open up Apple- and Google-dominated p... \n", + "4 Khashoggi's widow sues Israeli spyware company... \n", + "5 Factbox: DLE companies racing to reshape globa... \n", + "6 Podcast: US-Iran talks and Australia divided o... \n", + "7 Chinese e-commerce giants entice cautious cons... \n", + "8 S&P 500 leaps to highest close in 14 months; t... \n", + "9 Microsoft notches record high valuation of nea... \n", + "\n", + " description \n", + "0 A Hong Kong appeal court on Monday blocked jai... \n", + "1 U.S. Secretary of State Anthony Blinken is in ... \n", + "2 When Steven Du took over his parents' factory ... \n", + "3 Japan plans to stoke competition in smartphone... \n", + "4 The widow of murdered Saudi journalist Jamal K... \n", + "5 Lithium, the metal used to make electric vehic... \n", + "6 Australia is divided on a historic referendum ... \n", + "7 China's e-commerce platforms are competing fie... \n", + "8 The S&P 500 and Nasdaq surged on Thursday to c... \n", + "9 Microsoft Corp shares rose to a new record hig... 
" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"published_time\",\"title\",\"description\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sina Finance" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.sina_finance_date_range import Sina_Finance_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2016-01-01\"\n", + "end_date = \"2016-01-01\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 5/5 [00:04<00:00, 1.05it/s]\n", + "Checking ips: 100%|██████████| 75/75 [00:20<00:00, 3.67it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "获取到的代理ip数量: 75 。Get proxy ips: 75.\n", + "能用的代理数量: 75。Usable proxy ips: 75.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading Titles...: 100%|██████████| 1/1 [00:01<00:00, 1.54s/it]\n", + "Gathering news contents: 100%|██████████| 103/103 [00:22<00:00, 4.50it/s]\n" + ] + } + ], + "source": [ + "news_downloader = Sina_Finance_Date_Range(config)\n", + "news_downloader.download_date_range_all(start_date,end_date)\n", + "news_downloader.gather_content()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlecontent
0分析师:伊朗重回国际原油市场无法阻止新浪美股讯 北京时间1月1日晚CNBC称,加拿大皇家银行(RBC)分析师Helima Cro...
1FAA:波音767的逃生扶梯存在缺陷新浪美股讯 北京时间1日晚,美国联邦航空局(FAA)要求航空公司对波音767机型的救生扶梯进...
2非制造业新订单指数创新高 需求回升力度明显中新社北京1月1日电 (记者 刘长忠)记者1日从中国物流与采购联合会获悉,在最新发布的201...
3雷曼兄弟针对大和证券提起索赔诉讼新浪美股讯 北京时间1日下午共同社称,2008年破产的美国金融巨头雷曼兄弟公司的清算法人日前...
4国内钢铁PMI有所回升 钢市低迷形势有所改善新华社上海1月1日专电(记者李荣)据中物联钢铁物流专业委员会1日发布的指数报告,2015年1...
5马息岭凸显朝鲜旅游体育战略新浪美股北京时间1日讯 三位单板滑雪手将成为最早拜访马息岭滑雪场的西方专业运动员,他们本月就...
6五洲船舶破产清算 近十年来首现国有船厂倒闭(原标题:中国首家国有船厂破产倒闭)\\n低迷的中国造船市场,多年来首次出现国有船厂破产清算的...
7过半城市房价环比上涨 百城住宅均价加速升温资料图。中新社记者 武俊杰 摄\\n中新社北京1月1日电 (记者 庞无忌)中国房地产市场在20...
8经济学人:巴西病根到底在哪里新浪美股北京时间1日讯 原本,巴西人是该高高兴兴迎接2016年的。8月间,里约热内卢将举办南...
9中国首家国有船厂破产倒闭:五洲船舶目前已停工低迷的中国造船市场,多年来首次出现国有船厂破产清算的一幕。浙江海运集团旗下的五洲船舶修造公司...
\n", + "
" + ], + "text/plain": [ + " title content\n", + "0 分析师:伊朗重回国际原油市场无法阻止 新浪美股讯 北京时间1月1日晚CNBC称,加拿大皇家银行(RBC)分析师Helima Cro...\n", + "1 FAA:波音767的逃生扶梯存在缺陷 新浪美股讯 北京时间1日晚,美国联邦航空局(FAA)要求航空公司对波音767机型的救生扶梯进...\n", + "2 非制造业新订单指数创新高 需求回升力度明显 中新社北京1月1日电 (记者 刘长忠)记者1日从中国物流与采购联合会获悉,在最新发布的201...\n", + "3 雷曼兄弟针对大和证券提起索赔诉讼 新浪美股讯 北京时间1日下午共同社称,2008年破产的美国金融巨头雷曼兄弟公司的清算法人日前...\n", + "4 国内钢铁PMI有所回升 钢市低迷形势有所改善 新华社上海1月1日专电(记者李荣)据中物联钢铁物流专业委员会1日发布的指数报告,2015年1...\n", + "5 马息岭凸显朝鲜旅游体育战略 新浪美股北京时间1日讯 三位单板滑雪手将成为最早拜访马息岭滑雪场的西方专业运动员,他们本月就...\n", + "6 五洲船舶破产清算 近十年来首现国有船厂倒闭 (原标题:中国首家国有船厂破产倒闭)\\n低迷的中国造船市场,多年来首次出现国有船厂破产清算的...\n", + "7 过半城市房价环比上涨 百城住宅均价加速升温 资料图。中新社记者 武俊杰 摄\\n中新社北京1月1日电 (记者 庞无忌)中国房地产市场在20...\n", + "8 经济学人:巴西病根到底在哪里 新浪美股北京时间1日讯 原本,巴西人是该高高兴兴迎接2016年的。8月间,里约热内卢将举办南...\n", + "9 中国首家国有船厂破产倒闭:五洲船舶目前已停工 低迷的中国造船市场,多年来首次出现国有船厂破产清算的一幕。浙江海运集团旗下的五洲船舶修造公司..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"content\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Eastmoney" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 3\n", + "stock = \"600519\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 5/5 [00:04<00:00, 1.08it/s]\n", + "Checking ips: 100%|██████████| 75/75 [00:20<00:00, 3.62it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"获取到的代理ip数量: 75 。Get proxy ips: 75.\n", + "能用的代理数量: 75。Usable proxy ips: 75.\n", + "Geting pages: 0 1 2 Get total 3 pages.\n" + ] + } + ], + "source": [ + "news_downloader = Eastmoney_Streaming(config)\n", + "news_downloader.download_streaming_stock(stock,pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
read amountcommentstitlecontent linkauthorcreate time
014076茅台2022年报的12个小秘密/news,600519,1295554981.html贵州茅台资讯04-09 19:40
12340东北证券维持贵州茅台买入评级 预计2023年净利润同比/news,600519,1295512910.html公司研报提示04-09 11:24
23850贵州茅台:融资余额169.34亿元,创近一年新低(04-07/news,600519,1295407809.html贵州茅台资讯04-08 07:30
32330贵州茅台:融资净买入1248.48万元,融资余额169.79亿/news,600519,1294929438.html贵州茅台资讯04-07 07:28
4280416贵州茅台公益基金会正式成立/news,600519,1294612056.html贵州茅台资讯04-06 12:29
53330贵州茅台04月04日获沪股通增持19.55万股/news,600519,1294268016.html贵州茅台资讯04-05 07:48
63120贵州茅台:融资余额169.66亿元,创近一年新低(04-04/news,600519,1294265710.html贵州茅台资讯04-05 07:30
722721164月4日北向资金最新动向(附十大成交股)/news,600519,1294192188.html贵州茅台资讯04-04 18:48
86541大宗交易:贵州茅台成交235.9万元,成交价1814.59元(/news,600519,1294173281.html贵州茅台资讯04-04 17:21
92330第一上海证券维持贵州茅台买入评级 目标价2428.8元/news,600519,1293784734.html公司研报提示04-04 09:30
\n", + "
" + ], + "text/plain": [ + " read amount comments title \\\n", + "0 1407 6 茅台2022年报的12个小秘密 \n", + "1 234 0 东北证券维持贵州茅台买入评级 预计2023年净利润同比 \n", + "2 385 0 贵州茅台:融资余额169.34亿元,创近一年新低(04-07 \n", + "3 233 0 贵州茅台:融资净买入1248.48万元,融资余额169.79亿 \n", + "4 2804 16 贵州茅台公益基金会正式成立 \n", + "5 333 0 贵州茅台04月04日获沪股通增持19.55万股 \n", + "6 312 0 贵州茅台:融资余额169.66亿元,创近一年新低(04-04 \n", + "7 22721 16 4月4日北向资金最新动向(附十大成交股) \n", + "8 654 1 大宗交易:贵州茅台成交235.9万元,成交价1814.59元( \n", + "9 233 0 第一上海证券维持贵州茅台买入评级 目标价2428.8元 \n", + "\n", + " content link author create time \n", + "0 /news,600519,1295554981.html 贵州茅台资讯 04-09 19:40 \n", + "1 /news,600519,1295512910.html 公司研报提示 04-09 11:24 \n", + "2 /news,600519,1295407809.html 贵州茅台资讯 04-08 07:30 \n", + "3 /news,600519,1294929438.html 贵州茅台资讯 04-07 07:28 \n", + "4 /news,600519,1294612056.html 贵州茅台资讯 04-06 12:29 \n", + "5 /news,600519,1294268016.html 贵州茅台资讯 04-05 07:48 \n", + "6 /news,600519,1294265710.html 贵州茅台资讯 04-05 07:30 \n", + "7 /news,600519,1294192188.html 贵州茅台资讯 04-04 18:48 \n", + "8 /news,600519,1294173281.html 贵州茅台资讯 04-04 17:21 \n", + "9 /news,600519,1293784734.html 公司研报提示 04-04 09:30 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"create time\"]\n", + "news_downloader[selected_columns].dataframe.head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Finnhub / Yahoo" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.finnhub_date_range import Finnhub_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2023-01-01\"\n", + "end_date = \"2023-01-03\"\n", + "config = {\n", + " \"use_proxy\": \"us_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + " \"token\": \"YOUR_FINNHUB_TOKEN\" # Available at 
https://finnhub.io/dashboard\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Checking ips: 100%|██████████| 75/75 [02:51<00:00, 2.28s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get proxy ips: 75.\n", + "Usable proxy ips: 75.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading Titles: 100%|██████████| 1/1 [00:02<00:00, 2.66s/it]\n", + "Gathering news contents: 48%|████▊ | 49/102 [03:18<02:51, 3.24s/it]c:\\Users\\Olive\\.conda\\envs\\finrl\\lib\\site-packages\\urllib3\\connectionpool.py:1052: InsecureRequestWarning: Unverified HTTPS request is being made to host 'thefly.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", + " InsecureRequestWarning,\n", + "Gathering news contents: 100%|██████████| 102/102 [06:15<00:00, 3.68s/it]\n" + ] + } + ], + "source": [ + "news_downloader = Finnhub_Date_Range(config)\n", + "news_downloader.download_date_range_stock(start_date,end_date)\n", + "news_downloader.gather_content()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorydatetimeheadlineidimagerelatedsourcesummaryurlcontent
0company2023-01-03 23:40:08My 26-Stock $349k Portfolio Gets A Nice Petrob...118107004https://media.gettyimages.com/id/1441204186/ph...AAPLSeekingAlphaMy portfolio, built specifically for my retire...https://finnhub.io/api/news?id=d3c15f6f365663b...Home\\nInvesting Strategy\\nPortfolio Strategy\\n...
1company2023-01-03 22:09:00Apple’s Market Cap Slides Below $2 Trillion fo...118105849AAPLYahooThe tech giant is one of only five U.S. compan...https://finnhub.io/api/news?id=42343678a7474e1...Error
\n", + "
" + ], + "text/plain": [ + " category datetime \\\n", + "0 company 2023-01-03 23:40:08 \n", + "1 company 2023-01-03 22:09:00 \n", + "\n", + " headline id \\\n", + "0 My 26-Stock $349k Portfolio Gets A Nice Petrob... 118107004 \n", + "1 Apple’s Market Cap Slides Below $2 Trillion fo... 118105849 \n", + "\n", + " image related source \\\n", + "0 https://media.gettyimages.com/id/1441204186/ph... AAPL SeekingAlpha \n", + "1 AAPL Yahoo \n", + "\n", + " summary \\\n", + "0 My portfolio, built specifically for my retire... \n", + "1 The tech giant is one of only five U.S. compan... \n", + "\n", + " url \\\n", + "0 https://finnhub.io/api/news?id=d3c15f6f365663b... \n", + "1 https://finnhub.io/api/news?id=42343678a7474e1... \n", + "\n", + " content \n", + "0 Home\\nInvesting Strategy\\nPortfolio Strategy\\n... \n", + "1 Error " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = news_downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
headlinecontent
0My 26-Stock $349k Portfolio Gets A Nice Petrob...Home\\nInvesting Strategy\\nPortfolio Strategy\\n...
1Apple’s Market Cap Slides Below $2 Trillion fo...Error
2US STOCKS-Wall St starts the year with a dip; ...(For a Reuters live blog on U.S., UK and Europ...
3Buy 4 January Dogs Of The Dow, Watch 4 MoreHome\\nDividends\\nDividend Quick Picks\\nBuy 4 J...
4Apple's stock market value falls below $2 tril...Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto...
5CORRECTED-UPDATE 1-Apple's stock market value ...Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto...
6Apple Stock Falls Amid Report Of Product Order...Apple stock got off to a slow start in 2023 as...
7US STOCKS-Wall St starts the year with a dip; ...Summary\\nCompanies\\nTesla shares plunge on Q4 ...
8More than $1 trillion wiped off value of Apple...apple store\\nMore than $1 trillion has been wi...
9McLean's Iridium inks agreement to put its sat...The company hasn't named its partner, but it's...
\n", + "
" + ], + "text/plain": [ + " headline \\\n", + "0 My 26-Stock $349k Portfolio Gets A Nice Petrob... \n", + "1 Apple’s Market Cap Slides Below $2 Trillion fo... \n", + "2 US STOCKS-Wall St starts the year with a dip; ... \n", + "3 Buy 4 January Dogs Of The Dow, Watch 4 More \n", + "4 Apple's stock market value falls below $2 tril... \n", + "5 CORRECTED-UPDATE 1-Apple's stock market value ... \n", + "6 Apple Stock Falls Amid Report Of Product Order... \n", + "7 US STOCKS-Wall St starts the year with a dip; ... \n", + "8 More than $1 trillion wiped off value of Apple... \n", + "9 McLean's Iridium inks agreement to put its sat... \n", + "\n", + " content \n", + "0 Home\\nInvesting Strategy\\nPortfolio Strategy\\n... \n", + "1 Error \n", + "2 (For a Reuters live blog on U.S., UK and Europ... \n", + "3 Home\\nDividends\\nDividend Quick Picks\\nBuy 4 J... \n", + "4 Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto... \n", + "5 Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto... \n", + "6 Apple stock got off to a slow start in 2023 as... \n", + "7 Summary\\nCompanies\\nTesla shares plunge on Q4 ... \n", + "8 apple store\\nMore than $1 trillion has been wi... \n", + "9 The company hasn't named its partner, but it's... 
" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"headline\", \"content\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finrl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_SEC_Filings.ipynb b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_SEC_Filings.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..115a8069335890d32f8c4c6d427a5f892202eef1 --- /dev/null +++ b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_SEC_Filings.ipynb @@ -0,0 +1,234 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Changing the directory for access to finnlp" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/athekunal/OSS Finance/FinNLP\n" + ] + } + ], + "source": [ + "import os\n", + "os.chdir('../../../..')\n", + "# print(os.getcwd())" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.sec_filings import SECFilingsLoader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The SEC Downloader expects 5 attributes\n", + "\n", + "* tickers: It is a list of valid tickers\n", + "* amount: Number of documents that you want to download\n", + "* filing_type: 10-K or 10-Q filing type\n", + "* 
num_workers: It is for multithreading and multiprocessing. We have multi-threading at the ticker level and multi-processing at the year level for a given ticker\n", + "* include_amends: To include amendments or not." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "sec_data = SECFilingsLoader(\n", + " ['AAPL'],1,'10-K',include_amends=True,num_workers=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Started for AAPL\n", + "Done for AAPL for document 10-K and year 2023\n", + "It took 5.18 seconds\n" + ] + } + ], + "source": [ + "sec_data.load_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It will download the data in the following directories and sub-directories\n", + "\n", + "```python\n", + "- AAPL\n", + " - 2018\n", + " - 10-K.json\n", + " - 2019\n", + " - 10-K.json\n", + " - 2020\n", + " - 10-K.json\n", + " - 2021\n", + " - 10-K.json\n", + " - 10-Q_12.json\n", + " - 2022\n", + " - 10-K.json\n", + " - 10-Q_03.json\n", + " - 10-Q_06.json\n", + " - 10-Q_12.json\n", + " - 2023\n", + " - 10-Q_04.json\n", + "- GOOGL\n", + " - 2018\n", + " - 10-K.json\n", + " - 2019\n", + " - 10-K.json\n", + " - 2020\n", + " - 10-K.json\n", + " - 2021\n", + " - 10-K.json\n", + " - 10-Q_09.json\n", + " - 2022\n", + " - 10-K.json\n", + " - 10-Q_03.json\n", + " - 10-Q_06.json\n", + " - 10-Q_09.json\n", + " - 2023\n", + " - 10-Q_03.json\n", + "- TSLA\n", + " - 2018\n", + " - 10-K.json\n", + " - 2019\n", + " - 10-K.json\n", + " - 2020\n", + " - 10-K.json\n", + " - 2021\n", + " - 10-K.json\n", + " - 10-KA.json\n", + " - 10-Q_09.json\n", + " - 2022\n", + " - 10-K.json\n", + " - 10-Q_03.json\n", + " - 10-Q_06.json\n", + " - 10-Q_09.json\n", + " - 2023\n", + " - 10-Q_03.json\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## READ DATA\n", + 
"Below we have section-wise separated text data for a given ticker in a given year" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open('data/AAPL/2023/10-K.json', 'r') as f:\n", + " data = json.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'year': '2023',\n", + " 'ticker': 'AAPL',\n", + " 'all_texts': {'BUSINESS': 'To remain competitive and stimulate customer demand, the Company must successfully manage frequent introductions and transitions of products and services. Due to the highly volatile and competitive nature of the markets and industries in which the Company competes, the Company must continually introduce new products, services and technologies, enhance existing products and services, effectively stimulate customer demand for new and upgraded products and services, and successfully manage the transition to these new and upgraded products and services. The success of new product and service introductions depends on a number of factors, including timely and successful development, market acceptance, the Company’s ability to manage the risks associated with new technologies and production ramp-up issues, the availability of application software for the Company’s products, the effective management of purchase commitments and inventory levels in line with anticipated product demand, the availability of products in appropriate quantities and at expected costs to meet anticipated demand, and the risk that new products and services may have quality or other defects or deficiencies. There can be no assurance the Company will successfully manage future introductions and transitions of products and services. The Company depends on component and product manufacturing and logistical services provided by outsourcing partners, many of which are located outside of the U.S. 
Substantially all of the Company’s manufacturing is performed in whole or in part by outsourcing partners located primarily in China mainland, India, Japan, South Korea, Taiwan and Vietnam, and a significant concentration of this manufacturing is currently performed by a small number of outsourcing partners, often in single locations. Changes or additions to the Company’s supply chain require considerable time and resources and involve significant risks and uncertainties. The Company has also outsourced much of its transportation and logistics management. While these arrangements can lower operating costs, they also reduce the Company’s direct control over production and distribution. Such diminished control has from time to time and may in the future have an adverse effect on the quality or quantity of products manufactured or services provided, or adversely affect the Company’s flexibility to respond to changing conditions. Although arrangements with these partners may contain provisions for product defect expense reimbursement, the Company generally remains responsible to the consumer for warranty and out-of-warranty service in the event of product defects and experiences unanticipated product defect liabilities from time to time. While the Company relies on its partners to adhere to its supplier code of conduct, violations of the supplier code of conduct occur from time to time and can materially adversely affect the Company’s business, reputation, results of operations and financial condition.',\n", + " 'RISK_FACTORS': 'The Company’s business, reputation, results of operations, financial condition and stock price can be affected by a number of factors, whether currently known or unknown, including those described below. When any one or more of these risks materialize from time to time, the Company’s business, reputation, results of operations, financial condition and stock price can be materially and adversely affected. 
Because of the following factors, as well as other factors affecting the Company’s results of operations and financial condition, past financial performance should not be considered to be a reliable indicator of future performance, and investors should not use historical trends to anticipate results or trends in future periods. This discussion of risk factors contains forward-looking statements. This section should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the consolidated financial statements and accompanying notes in Part II, Item 8, “Financial Statements and Supplementary Data” of this Form 10-K. The Company’s operations and performance depend significantly on global and regional economic conditions and adverse economic conditions can materially adversely affect the Company’s business, results of operations and financial condition. The Company has international operations with sales outside the U.S. representing a majority of the Company’s total net sales. In addition, the Company’s global supply chain is large and complex and a majority of the Company’s supplier facilities, including manufacturing and assembly sites, are located outside the U.S. As a result, the Company’s operations and performance depend significantly on global and regional economic conditions. Adverse macroeconomic conditions, including slow growth or recession, high unemployment, inflation, tighter credit, higher interest rates, and currency fluctuations, can adversely impact consumer confidence and spending and materially adversely affect demand for the Company’s products and services. In addition, consumer confidence and spending can be materially adversely affected in response to changes in fiscal and monetary policy, financial market volatility, declines in income or asset values, and other economic factors. 
In addition to an adverse impact on demand for the Company’s products and services, uncertainty about, or a decline in, global or regional economic conditions can have a significant impact on the Company’s suppliers, contract manufacturers, logistics providers, distributors, cellular network carriers and other channel partners, and developers. Potential outcomes include financial instability; inability to obtain credit to finance business operations; and insolvency. Adverse economic conditions can also lead to increased credit and collectibility risk on the Company’s trade receivables; the failure of derivative counterparties and other financial institutions; limitations on the Company’s ability to issue new debt; reduced liquidity; and declines in the fair values of the Company’s financial instruments. These and other impacts can materially adversely affect the Company’s business, results of operations, financial condition and stock price. The Company’s business can be impacted by political events, trade and other international disputes, war, terrorism, natural disasters, public health issues, industrial accidents and other business interruptions. Political events, trade and other international disputes, war, terrorism, natural disasters, public health issues, industrial accidents and other business interruptions can harm or disrupt international commerce and the global economy, and could have a material adverse effect on the Company and its customers, suppliers, contract manufacturers, logistics providers, distributors, cellular network carriers and other channel partners. The Company has a large, global business with sales outside the U.S. representing a majority of the Company’s total net sales, and the Company believes that it generally benefits from growth in international trade. 
Substantially all of the Company’s manufacturing is performed in whole or in part by outsourcing partners located primarily in China mainland, India, Japan, South Korea, Taiwan and Vietnam. Restrictions on international trade, such as tariffs and other controls on imports or exports of goods, technology or data, can materially adversely affect the Company’s operations and supply chain and limit the Company’s ability to offer and distribute its products and services to customers. The impact can be particularly significant if these restrictive measures apply to countries and regions where the Company derives a significant portion of its revenues and/or has significant supply chain operations. Restrictive measures can require the Company to take various actions, including changing suppliers, restructuring business relationships, and ceasing to offer third-party applications on its platforms. Changing the Company’s operations in accordance with new or changed restrictions on international trade can be expensive, time-consuming and disruptive to the Company’s operations. Such restrictions can be announced with little or no advance notice and the Company may not be able to effectively mitigate all adverse impacts from such measures. For example, tensions between governments, including the U.S. and China, have in the past led to tariffs and other restrictions being imposed on the Company’s business. If disputes and conflicts further escalate in the future, actions by governments in response could be significantly more severe and restrictive and could materially adversely affect the Company’s business. Political uncertainty surrounding trade and other international disputes could also have a negative effect on consumer confidence and spending, which could adversely affect the Company’s business. 
Many of the Company’s operations and facilities, as well as critical business operations of the Company’s suppliers and contract manufacturers, are in locations that are prone to earthquakes and other natural disasters. In addition, such operations and facilities are subject to the risk of interruption by fire, power shortages, nuclear power plant accidents and other industrial accidents, terrorist attacks and other hostile acts, ransomware and other cybersecurity attacks, labor disputes, public health issues, including pandemics such as the COVID-19 pandemic, and other events beyond the Company’s control. Global climate change is resulting in certain types of natural disasters, such as droughts, floods, hurricanes and wildfires, occurring more frequently or with more intense effects. Such events can make it difficult or impossible for the Company to manufacture and deliver products to its customers, create delays and inefficiencies in the Company’s supply and manufacturing chain, and result in slowdowns and outages to the Company’s service offerings, and negatively impact consumer spending and demand in affected areas. Following an interruption to its business, the Company can require substantial recovery time, experience significant expenditures to resume operations, and lose significant sales. Because the Company relies on single or limited sources for the supply and manufacture of many critical components, a business interruption affecting such sources would exacerbate any negative consequences to the Company. The Company’s operations are also subject to the risks of industrial accidents at its suppliers and contract manufacturers. While the Company’s suppliers are required to maintain safe working environments and operations, an industrial accident could occur and could result in serious injuries or loss of life, disruption to the Company’s business, and harm to the Company’s reputation. 
Major public health issues, including pandemics such as the COVID-19 pandemic, have adversely affected, and could in the future materially adversely affect, the Company due to their impact on the global economy and demand for consumer products; the imposition of protective public safety measures, such as stringent employee travel restrictions and limitations on freight services and the movement of products between regions; and disruptions in the Company’s operations, supply chain and sales and distribution channels, resulting in interruptions to the supply of current products and offering of existing services, and delays in production ramps of new products and development of new services. While the Company maintains insurance coverage for certain types of losses, such insurance coverage may be insufficient to cover all losses that may arise. Global markets for the Company’s products and services are highly competitive and subject to rapid technological change, and the Company may be unable to compete effectively in these markets. The Company’s products and services are offered in highly competitive global markets characterized by aggressive price competition and resulting downward pressure on gross margins, frequent introduction of new products and services, short product life cycles, evolving industry standards, continual improvement in product price and performance characteristics, rapid adoption of technological advancements by competitors, and price sensitivity on the part of consumers and businesses. The Company’s ability to compete successfully depends heavily on ensuring the continuing and timely introduction of innovative new products, services and technologies to the marketplace. The Company designs and develops nearly the entire solution for its products, including the hardware, operating system, numerous software applications and related services. As a result, the Company must make significant investments in R&D. 
There can be no assurance these investments will achieve expected returns, and the Company may not be able to develop and market new products and services successfully. The Company currently holds a significant number of patents, trademarks and copyrights and has registered, and applied to register, additional patents, trademarks and copyrights. In contrast, many of the Company’s competitors seek to compete primarily through aggressive pricing and very low cost structures, and by imitating the Company’s products and infringing on its intellectual property. Effective intellectual property protection is not consistently available in every country in which the Company operates. If the Company is unable to continue to develop and sell innovative new products with attractive margins or if competitors infringe on the Company’s intellectual property, the Company’s ability to maintain a competitive advantage could be materially adversely affected. The Company has a minority market share in the global smartphone, personal computer and tablet markets. The Company faces substantial competition in these markets from companies that have significant technical, marketing, distribution and other resources, as well as established hardware, software and digital content supplier relationships. In addition, some of the Company’s competitors have broader product lines, lower-priced products and a larger installed base of active devices. Competition has been particularly intense as competitors have aggressively cut prices and lowered product margins. Certain competitors have the resources, experience or cost structures to provide products at little or no profit or even at a loss. Some of the markets in which the Company competes have from time to time experienced little to no growth or contracted overall. 
Additionally, the Company faces significant competition as competitors imitate the Company’s product features and applications within their products or collaborate to offer solutions that are more competitive than those they currently offer. The Company also expects competition to intensify as competitors imitate the Company’s approach to providing components seamlessly within their offerings or work collaboratively to offer integrated solutions. The Company’s services also face substantial competition, including from companies that have significant resources and experience and have established service offerings with large customer bases. The Company competes with business models that provide content to users for free. The Company also competes with illegitimate means to obtain third-party digital content and applications. The Company’s business, results of operations and financial condition depend substantially on the Company’s ability to continually improve its products and services to maintain their functional and design advantages. There can be no assurance the Company will be able to continue to provide products and services that compete effectively. To remain competitive and stimulate customer demand, the Company must successfully manage frequent introductions and transitions of products and services. Due to the highly volatile and competitive nature of the markets and industries in which the Company competes, the Company must continually introduce new products, services and technologies, enhance existing products and services, effectively stimulate customer demand for new and upgraded products and services, and successfully manage the transition to these new and upgraded products and services. 
The success of new product and service introductions depends on a number of factors, including timely and successful development, market acceptance, the Company’s ability to manage the risks associated with new technologies and production ramp-up issues, the availability of application software for the Company’s products, the effective management of purchase commitments and inventory levels in line with anticipated product demand, the availability of products in appropriate quantities and at expected costs to meet anticipated demand, and the risk that new products and services may have quality or other defects or deficiencies. There can be no assurance the Company will successfully manage future introductions and transitions of products and services. The Company depends on component and product manufacturing and logistical services provided by outsourcing partners, many of which are located outside of the U.S. Substantially all of the Company’s manufacturing is performed in whole or in part by outsourcing partners located primarily in China mainland, India, Japan, South Korea, Taiwan and Vietnam, and a significant concentration of this manufacturing is currently performed by a small number of outsourcing partners, often in single locations. Changes or additions to the Company’s supply chain require considerable time and resources and involve significant risks and uncertainties. The Company has also outsourced much of its transportation and logistics management. While these arrangements can lower operating costs, they also reduce the Company’s direct control over production and distribution. Such diminished control has from time to time and may in the future have an adverse effect on the quality or quantity of products manufactured or services provided, or adversely affect the Company’s flexibility to respond to changing conditions. 
Although arrangements with these partners may contain provisions for product defect expense reimbursement, the Company generally remains responsible to the consumer for warranty and out-of-warranty service in the event of product defects and experiences unanticipated product defect liabilities from time to time. While the Company relies on its partners to adhere to its supplier code of conduct, violations of the supplier code of conduct occur from time to time and can materially adversely affect the Company’s business, reputation, results of operations and financial condition. The Company relies on single-source outsourcing partners in the U.S., Asia and Europe to supply and manufacture many components, and on outsourcing partners primarily located in Asia, for final assembly of substantially all of the Company’s hardware products. Any failure of these partners to perform can have a negative impact on the Company’s cost or supply of components or finished goods. In addition, manufacturing or logistics in these locations or transit to final destinations can be disrupted for a variety of reasons, including natural and man-made disasters, information technology system failures, commercial disputes, armed conflict, economic, business, labor, environmental, public health or political issues, or international trade disputes. The Company has invested in manufacturing process equipment, much of which is held at certain of its outsourcing partners, and has made prepayments to certain of its suppliers associated with long-term supply agreements. While these arrangements help ensure the supply of components and finished goods, if these outsourcing partners or suppliers experience severe financial problems or other disruptions in their business, such continued supply can be reduced or terminated, and the recoverability of manufacturing process equipment or prepayments can be negatively impacted. 
Future operating results depend upon the Company’s ability to obtain components in sufficient quantities on commercially reasonable terms. Because the Company currently obtains certain components from single or limited sources, the Company is subject to significant supply and pricing risks. Many components, including those that are available from multiple sources, are at times subject to industry-wide shortages and significant commodity pricing fluctuations that can materially adversely affect the Company’s business, results of operations and financial condition. For example, the global semiconductor industry has in the past experienced high demand and shortages of supply, which adversely affected the Company’s ability to obtain sufficient quantities of components and products on commercially reasonable terms or at all. Such disruptions could occur in the future. While the Company has entered into agreements for the supply of many components, there can be no assurance the Company will be able to extend or renew these agreements on similar terms, or at all. Component suppliers may suffer from poor financial conditions, which can lead to business failure for the supplier or consolidation within a particular industry, further limiting the Company’s ability to obtain sufficient quantities of components on commercially reasonable terms or at all. The effects of global or regional economic conditions on the Company’s suppliers, described in “The Company’s operations and performance depend significantly on global and regional economic conditions and adverse economic conditions can materially adversely affect the Company’s business, results of operations and financial condition,” above, can also affect the Company’s ability to obtain components. Therefore, the Company remains subject to significant risks of supply shortages and price increases that can materially adversely affect its business, results of operations and financial condition. 
The Company’s new products often utilize custom components available from only one source. When a component or product uses new technologies, initial capacity constraints may exist until the suppliers’ yields have matured or their manufacturing capacities have increased. The continued availability of these components at acceptable prices, or at all, can be affected for any number of reasons, including if suppliers decide to concentrate on the production of common components instead of components customized to meet the Company’s requirements. When the Company’s supply of components for a new or existing product has been delayed or constrained, or when an outsourcing partner has delayed shipments of completed products to the Company, the Company’s business, results of operations and financial condition have been adversely affected and future delays or constraints could materially adversely affect the Company’s business, results of operations and financial condition. The Company’s business and financial performance could also be materially adversely affected depending on the time required to obtain sufficient quantities from the source, or to identify and obtain sufficient quantities from an alternative source. The Company’s products and services may be affected from time to time by design and manufacturing defects that could materially adversely affect the Company’s business and result in harm to the Company’s reputation. The Company offers complex hardware and software products and services that can be affected by design and manufacturing defects. Sophisticated operating system software and applications, such as those offered by the Company, often have issues that can unexpectedly interfere with the intended operation of hardware or software products and services. Defects can also exist in components and products the Company purchases from third parties. 
Component defects could make the Company’s products unsafe and create a risk of environmental or property damage and personal injury. These risks may increase as the Company’s products are introduced into specialized applications, including health. In addition, the Company’s service offerings can have quality issues and from time to time experience outages, service slowdowns or errors. As a result, from time to time the Company’s services have not performed as anticipated and may not meet customer expectations. There can be no assurance the Company will be able to detect and fix all issues and defects in the hardware, software and services it offers. Failure to do so can result in widespread technical and performance issues affecting the Company’s products and services. In addition, the Company can be exposed to product liability claims, recalls, product replacements or modifications, write-offs of inventory, property, plant and equipment or intangible assets, and significant warranty and other expenses, including litigation costs and regulatory fines. Quality problems can also adversely affect the experience for users of the Company’s products and services, and result in harm to the Company’s reputation, loss of competitive advantage, poor market acceptance, reduced demand for products and services, delay in new product and service introductions and lost sales. The Company is exposed to the risk of write-downs on the value of its inventory and other assets, in addition to purchase commitment cancellation risk. The Company records a write-down for product and component inventories that have become obsolete or exceed anticipated demand, or for which cost exceeds net realizable value. The Company also accrues necessary cancellation fee reserves for orders of excess products and components. 
The Company reviews long-lived assets, including capital assets held at its suppliers’ facilities and inventory prepayments, for impairment whenever events or circumstances indicate the assets may not be recoverable. If the Company determines that an impairment has occurred, it records a write-down equal to the amount by which the carrying value of the asset exceeds its fair value. Although the Company believes its inventory, capital assets, inventory prepayments and other assets and purchase commitments are currently recoverable, there can be no assurance the Company will not incur write-downs, fees, impairments and other charges given the rapid and unpredictable pace of product obsolescence in the industries in which the Company competes. The Company orders components for its products and builds inventory in advance of product announcements and shipments. Manufacturing purchase obligations cover the Company’s forecasted component and manufacturing requirements, typically for periods up to 150 days. Because the Company’s markets are volatile, competitive and subject to rapid technology and price changes, there is a risk the Company will forecast incorrectly and order or produce excess or insufficient amounts of components or products, or not fully utilize firm purchase commitments. The Company relies on access to third-party intellectual property, which may not be available to the Company on commercially reasonable terms or at all. The Company’s products and services are designed to include intellectual property owned by third parties, which requires licenses from those third parties. In addition, because of technological changes in the industries in which the Company currently competes or in the future may compete, current extensive patent coverage and the rapid rate of issuance of new patents, the Company’s products and services can unknowingly infringe existing patents or intellectual property rights of others. 
From time to time, the Company has been notified that it may be infringing certain patents or other intellectual property rights of third parties. Based on experience and industry practice, the Company believes licenses to such third-party intellectual property can generally be obtained on commercially reasonable terms. However, there can be no assurance the necessary licenses can be obtained on commercially reasonable terms or at all. Failure to obtain the right to use third-party intellectual property, or to use such intellectual property on commercially reasonable terms, can preclude the Company from selling certain products or services, or otherwise have a material adverse impact on the Company’s business, results of operations and financial condition. The Company’s future performance depends in part on support from third-party software developers. The Company believes decisions by customers to purchase its hardware products depend in part on the availability of third-party software applications and services. There can be no assurance third-party developers will continue to develop and maintain software applications and services for the Company’s products. If third-party software applications and services cease to be developed and maintained for the Company’s products, customers may choose not to buy the Company’s products. The Company believes the availability of third-party software applications and services for its products depends in part on the developers’ perception and analysis of the relative benefits of developing, maintaining and upgrading such software and services for the Company’s products compared to competitors’ platforms, such as Android for smartphones and tablets, Windows for personal computers and tablets, and PlayStation, Nintendo and Xbox for gaming platforms. 
This analysis may be based on factors such as the market position of the Company and its products, the anticipated revenue that may be generated, expected future growth of product sales, and the costs of developing such applications and services. The Company’s minority market share in the global smartphone, personal computer and tablet markets can make developers less inclined to develop or upgrade software for the Company’s products and more inclined to devote their resources to developing and upgrading software for competitors’ products with larger market share. When developers focus their efforts on these competing platforms, the availability and quality of applications for the Company’s devices can suffer. The Company relies on the continued availability and development of compelling and innovative software applications for its products. The Company’s products and operating systems are subject to rapid technological change, and when third-party developers are unable to or choose not to keep up with this pace of change, their applications can fail to take advantage of these changes to deliver improved customer experiences, can operate incorrectly, and can result in dissatisfied customers and lower customer demand for the Company’s products. The Company distributes third-party applications for its products through the App Store. For the vast majority of applications, developers keep all of the revenue they generate on the App Store. The Company retains a commission from sales of applications and sales of digital services or goods initiated within an application. From time to time, the Company has made changes to its App Store, including actions taken in response to competition, market conditions and legal and regulatory requirements. 
The Company expects to make further business changes in the future, including as a result of legislative initiatives impacting the App Store, such as the European Union (“EU”) Digital Markets Act, which the Company is required to comply with by March 2024. The Company is also subject to litigation and investigations relating to the App Store, which have resulted in changes to the Company’s business practices, and may in the future result in further changes. Changes have included how developers communicate with consumers outside the App Store regarding alternative purchasing mechanisms. Future changes could also affect what the Company charges developers for access to its platforms, how it manages distribution of apps outside of the App Store, and how and to what extent it allows developers to communicate with consumers inside the App Store regarding alternative purchasing mechanisms. This could reduce the volume of sales, and the commission that the Company earns on those sales would decrease. If the rate of the commission that the Company retains on such sales is reduced, or if it is otherwise narrowed in scope or eliminated, the Company’s business, results of operations and financial condition could be materially adversely affected. Failure to obtain or create digital content that appeals to the Company’s customers, or to make such content available on commercially reasonable terms, could have a material adverse impact on the Company’s business, results of operations and financial condition. The Company contracts with numerous third parties to offer their digital content to customers. This includes the right to sell, or offer subscriptions to, third-party content, as well as the right to incorporate specific content into the Company’s own services. 
The licensing or other distribution arrangements for this content can be for relatively short time periods and do not guarantee the continuation or renewal of these arrangements on commercially reasonable terms, or at all. Some third-party content providers and distributors currently or in the future may offer competing products and services, and can take actions to make it difficult or impossible for the Company to license or otherwise distribute their content. Other content owners, providers or distributors may seek to limit the Company’s access to, or increase the cost of, such content. The Company may be unable to continue to offer a wide variety of content at commercially reasonable prices with acceptable usage rules. The Company also produces its own digital content, which can be costly to produce due to intense and increasing competition for talent, content and subscribers, and may fail to appeal to the Company’s customers. Some third-party digital content providers require the Company to provide digital rights management and other security solutions. If requirements change, the Company may have to develop or license new technology to provide these solutions. There can be no assurance the Company will be able to develop or license such solutions at a reasonable cost and in a timely manner. The Company’s success depends largely on the talents and efforts of its team members, the continued service and availability of highly skilled employees, including key personnel, and the Company’s ability to nurture its distinctive and inclusive culture. Much of the Company’s future success depends on the talents and efforts of its team members and the continued availability and service of key personnel, including its Chief Executive Officer, executive team and other highly skilled employees. 
Experienced personnel in the technology industry are in high demand and competition for their talents is intense, especially in Silicon Valley, where most of the Company’s key personnel are located. In addition to intense competition for talent, workforce dynamics are constantly evolving. If the Company does not manage changing workforce dynamics effectively, it could materially adversely affect the Company’s culture, reputation and operational flexibility. The Company believes that its distinctive and inclusive culture is a significant driver of its success. If the Company is unable to nurture its culture, it could materially adversely affect the Company’s ability to recruit and retain the highly skilled employees who are critical to its success, and could otherwise materially adversely affect the Company’s business, reputation, results of operations and financial condition. The Company depends on the performance of carriers, wholesalers, retailers and other resellers. The Company distributes its products and certain of its services through cellular network carriers, wholesalers, retailers and resellers, many of which distribute products and services from competitors. The Company also sells its products and services and resells third-party products in most of its major markets directly to consumers, small and mid-sized businesses, and education, enterprise and government customers through its retail and online stores and its direct sales force. Some carriers providing cellular network service for the Company’s products offer financing, installment payment plans or subsidies for users’ purchases of the device. There can be no assurance such offers will be continued at all or in the same amounts. The Company has invested and will continue to invest in programs to enhance reseller sales, including staffing selected resellers’ stores with Company employees and contractors, and improving product placement displays. 
These programs can require a substantial investment while not assuring return or incremental sales. The financial condition of these resellers could weaken, these resellers could stop distributing the Company’s products, or uncertainty regarding demand for some or all of the Company’s products could cause resellers to reduce their ordering and marketing of the Company’s products. The Company’s business and reputation are impacted by information technology system failures and network disruptions. The Company and its global supply chain are dependent on complex information technology systems and are exposed to information technology system failures or network disruptions caused by natural disasters, accidents, power disruptions, telecommunications failures, acts of terrorism or war, computer viruses, physical or electronic break-ins, ransomware or other cybersecurity incidents, or other events or disruptions. System upgrades, redundancy and other continuity measures may be ineffective or inadequate, and the Company’s or its vendors’ business continuity and disaster recovery planning may not be sufficient for all eventualities. Such failures or disruptions can adversely impact the Company’s business by, among other things, preventing access to the Company’s online services, interfering with customer transactions or impeding the manufacturing and shipping of the Company’s products. These events could materially adversely affect the Company’s business, reputation, results of operations and financial condition. Losses or unauthorized access to or releases of confidential information, including personal information, could subject the Company to significant reputational, financial, legal and operational consequences. The Company’s business requires it to use and store confidential information, including personal information, with respect to the Company’s customers and employees. 
The Company devotes significant resources to network and data security, including through the use of encryption and other security measures intended to protect its systems and data. But these measures cannot provide absolute security, and losses or unauthorized access to or releases of confidential information occur and could materially adversely affect the Company’s business, reputation, results of operations and financial condition. The Company’s business also requires it to share confidential information with suppliers and other third parties. The Company relies on global suppliers that are also exposed to ransomware and other malicious attacks that can disrupt business operations. Although the Company takes steps to secure confidential information that is provided to or accessible by third parties working on the Company’s behalf, such measures are not always effective and losses or unauthorized access to, or releases of, confidential information occur. Such incidents and other malicious attacks could materially adversely affect the Company’s business, reputation, results of operations and financial condition. The Company experiences malicious attacks and other attempts to gain unauthorized access to its systems on a regular basis. These attacks seek to compromise the confidentiality, integrity or availability of confidential information or disrupt normal business operations, and can, among other things, impair the Company’s ability to attract and retain customers for its products and services, impact the Company’s stock price, materially damage commercial relationships, and expose the Company to litigation or government investigations, which could result in penalties, fines or judgments against the Company. 
Globally, attacks are expected to continue accelerating in both frequency and sophistication with increasing use by actors of tools and techniques that are designed to circumvent controls, avoid detection, and remove or obfuscate forensic evidence, all of which hinders the Company’s ability to identify, investigate and recover from incidents. In addition, attacks against the Company and its customers can escalate during periods of severe diplomatic or armed conflict. Although malicious attacks perpetrated to gain access to confidential information, including personal information, affect many companies across various industries, the Company is at a relatively greater risk of being targeted because of its high profile and the value of the confidential information it creates, owns, manages, stores and processes. The Company has implemented systems and processes intended to secure its information technology systems and prevent unauthorized access to or loss of sensitive data, and mitigate the impact of unauthorized access, including through the use of encryption and authentication technologies. As with all companies, these security measures may not be sufficient for all eventualities and may be vulnerable to hacking, ransomware attacks, employee error, malfeasance, system error, faulty password management or other irregularities. For example, third parties can fraudulently induce the Company’s or its vendors’ employees or customers into disclosing usernames, passwords or other sensitive information, which can, in turn, be used for unauthorized access to the Company’s or its vendors’ systems and services. 
To help protect customers and the Company, the Company deploys and makes available technologies like multifactor authentication, monitors its services and systems for unusual activity and may freeze accounts under suspicious circumstances, which, among other things, can result in the delay or loss of customer orders or impede customer access to the Company’s products and services. While the Company maintains insurance coverage that is intended to address certain aspects of data security risks, such insurance coverage may be insufficient to cover all losses or all types of claims that may arise. Investment in new business strategies and acquisitions could disrupt the Company’s ongoing business, present risks not originally contemplated and materially adversely affect the Company’s business, reputation, results of operations and financial condition. The Company has invested, and in the future may invest, in new business strategies or acquisitions. Such endeavors may involve significant risks and uncertainties, including distraction of management from current operations, greater-than-expected liabilities and expenses, economic, political, legal and regulatory challenges associated with operating in new businesses, regions or countries, inadequate return on capital, potential impairment of tangible and intangible assets, and significant write-offs. Investment and acquisition transactions are exposed to additional risks, including failing to obtain required regulatory approvals on a timely basis or at all, or the imposition of onerous conditions that could delay or prevent the Company from completing a transaction or otherwise limit the Company’s ability to fully realize the anticipated benefits of a transaction. These new ventures are inherently risky and may not be successful. The failure of any significant investment could materially adversely affect the Company’s business, reputation, results of operations and financial condition. 
The Company’s retail stores are subject to numerous risks and uncertainties. The Company’s retail operations are subject to many factors that pose risks and uncertainties and could adversely impact the Company’s business, results of operations and financial condition, including macroeconomic factors that could have an adverse effect on general retail activity. Other factors include the Company’s ability to: manage costs associated with retail store construction and operation; manage relationships with existing retail partners; manage costs associated with fluctuations in the value of retail inventory; and obtain and renew leases in quality retail locations at a reasonable cost. The Company’s business, results of operations and financial condition could be adversely impacted by unfavorable results of legal proceedings or government investigations. The Company is subject to various claims, legal proceedings and government investigations that have arisen in the ordinary course of business and have not yet been fully resolved, and new matters may arise in the future. In addition, agreements entered into by the Company sometimes include indemnification provisions which can subject the Company to costs and damages in the event of a claim against an indemnified third party. The number of claims, legal proceedings and government investigations involving the Company, and the alleged magnitude of such claims, proceedings and government investigations, has generally increased over time and may continue to increase. The Company has faced and continues to face a significant number of patent claims relating to its cellular-enabled products, and new claims may arise in the future, including as a result of new legal or regulatory frameworks. For example, technology and other patent-holding companies frequently assert their patents and seek royalties and often enter into litigation based on allegations of patent infringement or other violations of intellectual property rights. 
The Company is vigorously defending infringement actions in courts in several U.S. jurisdictions, as well as internationally in various countries. The plaintiffs in these actions frequently seek injunctions and substantial damages. Regardless of the merit of particular claims, defending against litigation or responding to government investigations can be expensive, time-consuming and disruptive to the Company’s operations. In recognition of these considerations, the Company may enter into agreements or other arrangements to settle litigation and resolve such challenges. There can be no assurance such agreements can be obtained on acceptable terms or that litigation will not occur. These agreements can also significantly increase the Company’s cost of sales and operating expenses and require the Company to change its business practices and limit the Company’s ability to offer certain products and services. Except as described in Part I, Item 3 of this Form 10-K under the heading “Legal Proceedings” and in Part II, Item 8 of this Form 10-K in the Notes to Consolidated Financial Statements in Note 12, “Commitments, Contingencies and Supply Concentrations” under the heading “Contingencies,” in the opinion of management, there was not at least a reasonable possibility the Company may have incurred a material loss, or a material loss greater than a recorded accrual, concerning loss contingencies for asserted legal and other claims. The outcome of litigation or government investigations is inherently uncertain. If one or more legal matters were resolved against the Company or an indemnified third party in a reporting period for amounts above management’s expectations, the Company’s results of operations and financial condition for that reporting period could be materially adversely affected. 
Further, such an outcome can result in significant compensatory, punitive or trebled monetary damages, disgorgement of revenue or profits, remedial corporate measures or injunctive relief against the Company, and has from time to time required, and can in the future require, the Company to change its business practices and limit the Company’s ability to offer certain products and services, all of which could materially adversely affect the Company’s business, reputation, results of operations and financial condition. While the Company maintains insurance coverage for certain types of claims, such insurance coverage may be insufficient to cover all losses or all types of claims that may arise. The Company is subject to complex and changing laws and regulations worldwide, which exposes the Company to potential liabilities, increased costs and other adverse effects on the Company’s business. The Company’s global operations are subject to complex and changing laws and regulations on subjects, including antitrust; privacy, data security and data localization; consumer protection; advertising, sales, billing and e-commerce; financial services and technology; product liability; intellectual property ownership and infringement; digital platforms; machine learning and artificial intelligence; internet, telecommunications and mobile communications; media, television, film and digital content; availability of third-party software applications and services; labor and employment; anticorruption; import, export and trade; foreign exchange controls and cash repatriation restrictions; anti–money laundering; foreign ownership and investment; tax; and environmental, health and safety, including electronic waste, recycling, product design and climate change. Compliance with these laws and regulations is onerous and expensive. 
New and changing laws and regulations can adversely affect the Company’s business by increasing the Company’s costs, limiting the Company’s ability to offer a product, service or feature to customers, imposing changes to the design of the Company’s products and services, impacting customer demand for the Company’s products and services, and requiring changes to the Company’s supply chain and its business. New and changing laws and regulations can also create uncertainty about how such laws and regulations will be interpreted and applied. These risks and costs may increase as the Company’s products and services are introduced into specialized applications, including health and financial services. The Company has implemented policies and procedures designed to ensure compliance with applicable laws and regulations, but there can be no assurance the Company’s employees, contractors or agents will not violate such laws and regulations or the Company’s policies and procedures. If the Company is found to have violated laws and regulations, it could materially adversely affect the Company’s business, reputation, results of operations and financial condition. Regulatory changes and other actions that materially adversely affect the Company’s business may be announced with little or no advance notice and the Company may not be able to effectively mitigate all adverse impacts from such measures. For example, the Company is subject to changing regulations relating to the export and import of its products. Although the Company has programs, policies and procedures in place that are designed to satisfy regulatory requirements, there can be no assurance that such policies and procedures will be effective in preventing a violation or a claim of a violation. As a result, the Company’s products could be banned, delayed or prohibited from importation, which could materially adversely affect the Company’s business, reputation, results of operations and financial condition. 
Expectations relating to environmental, social and governance considerations and related reporting obligations expose the Company to potential liabilities, increased costs, reputational harm, and other adverse effects on the Company’s business. Many governments, regulators, investors, employees, customers and other stakeholders are increasingly focused on environmental, social and governance considerations relating to businesses, including climate change and greenhouse gas emissions, human and civil rights, and diversity, equity and inclusion. In addition, the Company makes statements about its goals and initiatives through its various non-financial reports, information provided on its website, press statements and other communications. Responding to these environmental, social and governance considerations and implementation of these goals and initiatives involves risks and uncertainties, requires investments, and depends in part on third-party performance or data that is outside the Company’s control. The Company cannot guarantee that it will achieve its announced environmental, social and governance goals and initiatives. In addition, some stakeholders may disagree with the Company’s goals and initiatives. Any failure, or perceived failure, by the Company to achieve its goals, further its initiatives, adhere to its public statements, comply with federal, state or international environmental, social and governance laws and regulations, or meet evolving and varied stakeholder expectations and standards could result in legal and regulatory proceedings against the Company and materially adversely affect the Company’s business, reputation, results of operations, financial condition and stock price. The technology industry, including, in some instances, the Company, is subject to intense media, political and regulatory scrutiny, which exposes the Company to increasing regulation, government investigations, legal actions and penalties. 
From time to time, the Company has made changes to its App Store, including actions taken in response to litigation, competition, market conditions and legal and regulatory requirements. The Company expects to make further business changes in the future, including as a result of legislative initiatives impacting the App Store, such as the EU Digital Markets Act, which the Company is required to comply with by March 2024, or similar laws in other jurisdictions. Changes have included how developers communicate with consumers outside the App Store regarding alternative purchasing mechanisms. Future changes could also affect what the Company charges developers for access to its platforms, how it manages distribution of apps outside of the App Store, and how and to what extent it allows developers to communicate with consumers inside the App Store regarding alternative purchasing mechanisms. The Company is also currently subject to antitrust investigations in various jurisdictions around the world, which can result in legal proceedings and claims against the Company that could, individually or in the aggregate, have a materially adverse impact on the Company’s business, results of operations and financial condition. For example, the Company is the subject of investigations in Europe and other jurisdictions relating to App Store terms and conditions. If such investigations result in adverse findings against the Company, the Company could be exposed to significant fines and may be required to make changes to its App Store business, all of which could materially adversely affect the Company’s business, results of operations and financial condition. The Company is also subject to litigation relating to the App Store, which has resulted in changes to the Company’s business practices, and may in the future result in further changes. 
Further, the Company has commercial relationships with other companies in the technology industry that are or may become subject to investigations and litigation that, if resolved against those other companies, could materially adversely affect the Company’s commercial relationships with those business partners and materially adversely affect the Company’s business, results of operations and financial condition. For example, the Company earns revenue from licensing arrangements with other companies to offer their search services on the Company’s platforms and applications, and certain of these arrangements are currently subject to government investigations and legal proceedings. There can be no assurance the Company’s business will not be materially adversely affected, individually or in the aggregate, by the outcomes of such investigations, litigation or changes to laws and regulations in the future. Changes to the Company’s business practices to comply with new laws and regulations or in connection with other legal proceedings could negatively impact the reputation of the Company’s products for privacy and security and otherwise adversely affect the experience for users of the Company’s products and services, and result in harm to the Company’s reputation, loss of competitive advantage, poor market acceptance, reduced demand for products and services, and lost sales. The Company’s business is subject to a variety of U.S. and international laws, rules, policies and other obligations regarding data protection. The Company is subject to an increasing number of federal, state and international laws relating to the collection, use, retention, security and transfer of various types of personal information. In many cases, these laws apply not only to third-party transactions, but also restrict transfers of personal information among the Company and its international subsidiaries. 
Several jurisdictions have passed laws in this area, and additional jurisdictions are considering imposing additional restrictions or have laws that are pending. These laws continue to develop and may be inconsistent from jurisdiction to jurisdiction. Complying with emerging and changing requirements causes the Company to incur substantial costs and has required and may in the future require the Company to change its business practices. Noncompliance could result in significant penalties or legal liability. The Company makes statements about its use and disclosure of personal information through its privacy policy, information provided on its website, press statements and other privacy notices provided to customers. Any failure by the Company to comply with these public statements or with other federal, state or international privacy or data protection laws and regulations could result in inquiries or proceedings against the Company by governmental entities or others. In addition to reputational impacts, penalties could include ongoing audit requirements and significant legal liability. In addition to the risks generally relating to the collection, use, retention, security and transfer of personal information, the Company is also subject to specific obligations relating to information considered sensitive under applicable laws, such as health data, financial data and biometric data. Health data and financial data are subject to additional privacy, security and breach notification requirements, and the Company is subject to audit by governmental authorities regarding the Company’s compliance with these obligations. 
If the Company fails to adequately comply with these rules and requirements, or if health data or financial data is handled in a manner not permitted by law or under the Company’s agreements with healthcare or financial institutions, the Company can be subject to litigation or government investigations, and can be liable for associated investigatory expenses, and can also incur significant fees or fines. Payment card data is also subject to additional requirements. Under payment card rules and obligations, if cardholder information is potentially compromised, the Company can be liable for associated investigatory expenses and can also incur significant fees or fines if the Company fails to follow payment card industry data security standards. The Company could also experience a significant increase in payment card transaction costs or lose the ability to process payment cards if it fails to follow payment card industry data security standards, which could materially adversely affect the Company’s business, reputation, results of operations and financial condition. The Company expects its quarterly net sales and results of operations to fluctuate. The Company’s profit margins vary across its products, services, geographic segments and distribution channels. For example, the gross margins on the Company’s products and services vary significantly and can change over time. 
The Company’s gross margins are subject to volatility and downward pressure due to a variety of factors, including: continued industry-wide global product pricing pressures and product pricing actions that the Company may take in response to such pressures; increased competition; the Company’s ability to effectively stimulate demand for certain of its products and services; compressed product life cycles; supply shortages; potential increases in the cost of components, outside manufacturing services, and developing, acquiring and delivering content for the Company’s services; the Company’s ability to manage product quality and warranty costs effectively; shifts in the mix of products and services, or in the geographic, currency or channel mix, including to the extent that regulatory changes require the Company to modify its product and service offerings; fluctuations in foreign exchange rates; inflation and other macroeconomic pressures; and the introduction of new products or services, including new products or services with higher cost structures. These and other factors could have a materially adverse impact on the Company’s results of operations and financial condition. The Company has historically experienced higher net sales in its first quarter compared to other quarters in its fiscal year due in part to seasonal holiday demand. Additionally, new product and service introductions can significantly impact net sales, cost of sales and operating expenses. Further, the Company generates a significant portion of its net sales from a single product and a decline in demand for that product could significantly impact quarterly net sales. 
The Company could also be subject to unexpected developments, such as lower-than-anticipated demand for the Company’s products or services, issues with new product or service introductions, information technology system failures or network disruptions, or failure of one of the Company’s logistics, components supply, or manufacturing partners. The Company’s financial performance is subject to risks associated with changes in the value of the U.S. dollar relative to local currencies. The Company’s primary exposure to movements in foreign exchange rates relates to non–U.S. dollar–denominated sales, cost of sales and operating expenses worldwide. Gross margins on the Company’s products in foreign countries and on products that include components obtained from foreign suppliers have in the past been adversely affected and could in the future be materially adversely affected by foreign exchange rate fluctuations. The weakening of foreign currencies relative to the U.S. dollar adversely affects the U.S. dollar value of the Company’s foreign currency–denominated sales and earnings, and generally leads the Company to raise international pricing, potentially reducing demand for the Company’s products. In some circumstances, for competitive or other reasons, the Company may decide not to raise international pricing to offset the U.S. dollar’s strengthening, which would adversely affect the U.S. dollar value of the gross margins the Company earns on foreign currency–denominated sales. Conversely, a strengthening of foreign currencies relative to the U.S. dollar, while generally beneficial to the Company’s foreign currency–denominated sales and earnings, could cause the Company to reduce international pricing or incur losses on its foreign currency derivative instruments, thereby limiting the benefit. Additionally, strengthening of foreign currencies may increase the Company’s cost of product components denominated in those currencies, thus adversely affecting gross margins. 
The Company uses derivative instruments, such as foreign currency forward and option contracts, to hedge certain exposures to fluctuations in foreign exchange rates. The use of such hedging activities may not be effective to offset any, or more than a portion, of the adverse financial effects of unfavorable movements in foreign exchange rates over the limited time the hedges are in place. The Company is exposed to credit risk and fluctuations in the values of its investment portfolio. The Company’s investments can be negatively affected by changes in liquidity, credit deterioration, financial results, market and economic conditions, political risk, sovereign risk, interest rate fluctuations or other factors. As a result, the value and liquidity of the Company’s cash, cash equivalents and marketable securities may fluctuate substantially. Therefore, although the Company has not realized any significant losses on its cash, cash equivalents and marketable securities, future fluctuations in their value could result in significant losses and could have a material adverse impact on the Company’s results of operations and financial condition. The Company is exposed to credit risk on its trade accounts receivable, vendor non-trade receivables and prepayments related to long-term supply agreements, and this risk is heightened during periods when economic conditions worsen. The Company distributes its products and certain of its services through third-party cellular network carriers, wholesalers, retailers and resellers. The Company also sells its products and services directly to small and mid-sized businesses and education, enterprise and government customers. A substantial majority of the Company’s outstanding trade receivables are not covered by collateral, third-party bank support or financing arrangements, or credit insurance, and a significant portion of the Company’s trade receivables can be concentrated within cellular network carriers or other resellers. 
The Company’s exposure to credit and collectibility risk on its trade receivables is higher in certain international markets and its ability to mitigate such risks may be limited. The Company also has unsecured vendor non-trade receivables resulting from purchases of components by outsourcing partners and other vendors that manufacture subassemblies or assemble final products for the Company. In addition, the Company has made prepayments associated with long-term supply agreements to secure supply of inventory components. As of September\\xa030, 2023, the Company’s vendor non-trade receivables and prepayments related to long-term supply agreements were concentrated among a few individual vendors located primarily in Asia. While the Company has procedures to monitor and limit exposure to credit risk on its trade and vendor non-trade receivables, as well as long-term prepayments, there can be no assurance such procedures will effectively limit its credit risk and avoid losses. The Company is subject to changes in tax rates, the adoption of new U.S. or international tax legislation and exposure to additional tax liabilities. The Company is subject to taxes in the U.S. and numerous foreign jurisdictions, including Ireland and Singapore, where a number of the Company’s subsidiaries are organized. Due to economic and political conditions, tax laws and tax rates for income taxes and other non-income taxes in various jurisdictions may be subject to significant change. For example, the Organisation for Economic Co-operation and Development continues to advance proposals for modernizing international tax rules, including the introduction of global minimum tax standards. The Company’s effective tax rates are affected by changes in the mix of earnings in countries with differing statutory tax rates, changes in the valuation of deferred tax assets and liabilities, the introduction of new taxes, and changes in tax laws or their interpretation. 
The application of tax laws may be uncertain, require significant judgment and be subject to differing interpretations. The Company is also subject to the examination of its tax returns and other tax matters by the U.S. Internal Revenue Service and other tax authorities and governmental bodies. The Company regularly assesses the likelihood of an adverse outcome resulting from these examinations to determine the adequacy of its provision for taxes. There can be no assurance as to the outcome of these examinations. If the Company’s effective tax rates were to increase, or if the ultimate determination of the Company’s taxes owed is for an amount in excess of amounts previously accrued, the Company’s business, results of operations and financial condition could be materially adversely affected. The price of the Company’s stock is subject to volatility. The Company’s stock has experienced substantial price volatility in the past and may continue to do so in the future. Additionally, the Company, the technology industry and the stock market as a whole have, from time to time, experienced extreme stock price and volume fluctuations that have affected stock prices in ways that may have been unrelated to these companies’ operating performance. Price volatility may cause the average price at which the Company repurchases its stock in a given period to exceed the stock’s price at a given point in time. The Company believes the price of its stock should reflect expectations of future growth and profitability. The Company also believes the price of its stock should reflect expectations that its cash dividend will continue at current levels or grow, and that its current share repurchase program will be fully consummated. Future dividends are subject to declaration by the Company’s Board of Directors, and the Company’s share repurchase program does not obligate it to acquire any specific number of shares. 
If the Company fails to meet expectations related to future growth, profitability, dividends, share repurchases or other market expectations, the price of the Company’s stock may decline significantly, which could have a material adverse impact on investor confidence and employee retention.',\n", + " 'UNRESOLVED_STAFF_COMMENTS': '',\n", + " 'PROPERTIES': 'The Company’s headquarters is located in Cupertino, California. As of September\\xa030, 2023, the Company owned or leased facilities and land for corporate functions, R&D, data centers, retail and other purposes at locations throughout the U.S. and in various places outside the U.S. The Company believes its existing facilities and equipment, which are used by all reportable segments, are in good operating condition and are suitable for the conduct of its business.',\n", + " 'LEGAL_PROCEEDINGS': 'Epic Games, Inc. (“Epic”) filed a lawsuit in the U.S. District Court for the Northern District of California (the “District Court”) against the Company alleging violations of federal and state antitrust laws and California’s unfair competition law based upon the Company’s operation of its App Store. On September 10, 2021, the District Court ruled in favor of the Company with respect to nine out of the ten counts included in Epic’s claim. The District Court found that certain provisions of the Company’s App Store Review Guidelines violate California’s unfair competition law and issued an injunction enjoining the Company from prohibiting developers from including in their apps external links that direct customers to purchasing mechanisms other than Apple in-app purchasing. The injunction applies to apps on the U.S. storefront of the iOS and iPadOS App Store. On April 24, 2023, the U.S. Court of Appeals for the Ninth Circuit (the “Circuit Court”) affirmed the District Court’s ruling. On June 7, 2023, the Company and Epic filed petitions with the Circuit Court requesting further review of the decision. 
On June 30, 2023, the Circuit Court denied both petitions. On July 17, 2023, the Circuit Court granted Apple’s motion to stay enforcement of the injunction pending appeal to the U.S. Supreme Court. If the U.S. Supreme Court denies Apple’s petition, the stay of the injunction will expire. Masimo Corporation and Cercacor Laboratories, Inc. (together, “Masimo”) filed a complaint before the U.S. International Trade Commission (the “ITC”) alleging infringement by the Company of five patents relating to the functionality of the blood oxygen feature in Apple Watch Series 6 and 7. In its complaint, Masimo sought a permanent exclusion order prohibiting importation to the United States of certain Apple Watch models that include blood oxygen sensing functionality. On October 26, 2023, the ITC entered a limited exclusion order (the “Order”) prohibiting importation and sales in the United States of Apple Watch models with blood oxygen sensing functionality, which includes Apple Watch Series 9 and Ultra 2. The Order will not go into effect until the end of the administrative review period, which is currently expected to end on December 25, 2023. The Company intends to appeal the Order and seek a stay pending the appeal. The Company is subject to other legal proceedings and claims that have not been fully resolved and that have arisen in the ordinary course of business. The Company settled certain matters during the fourth quarter of 2023 that did not individually or in the aggregate have a material impact on the Company’s financial condition or operating results. The outcome of litigation is inherently uncertain. 
If one or more legal matters were resolved against the Company in a reporting period for amounts above management’s expectations, the Company’s financial condition and operating results for that reporting period could be materially adversely affected.',\n", + " 'MINE_SAFETY': '',\n", + " 'MARKET_FOR_REGISTRANT_COMMON_EQUITY': '',\n", + " 'MANAGEMENT_DISCUSSION': 'The following discussion should be read in conjunction with the consolidated financial statements and accompanying notes included in Part II, Item 8 of this Form 10-K. This Item generally discusses 2023 and 2022 items and year-to-year comparisons between 2023 and 2022. Discussions of 2021 items and year-to-year comparisons between 2022 and 2021 are not included, and can be found in “Management’s Discussion and Analysis of Financial Condition and Results of Operations” in Part II, Item 7 of the Company’s Annual Report on Form 10-K for the fiscal year ended September\\xa024, 2022. The Company’s fiscal year is the 52- or 53-week period that ends on the last Saturday of September. An additional week is included in the first fiscal quarter every five or six years to realign the Company’s fiscal quarters with calendar quarters, which occurred in the first quarter of 2023. The Company’s fiscal year 2023 spanned 53 weeks, whereas fiscal years 2022 and 2021 spanned 52 weeks each. The Company’s total net sales were $383.3 billion and net income was $97.0 billion during 2023. The Company’s total net sales decreased 3% or $11.0 billion during 2023 compared to 2022. The weakness in foreign currencies relative to the U.S. dollar accounted for more than the entire year-over-year decrease in total net sales, which consisted primarily of lower net sales of Mac and iPhone, partially offset by higher net sales of Services. The Company announces new product, service and software offerings at various times during the year. 
Significant announcements during fiscal year 2023 included the following: iPad and iPad Pro; Next-generation Apple TV 4K; and MLS Season Pass, a Major League Soccer subscription streaming service. MacBook Pro 14”, MacBook Pro 16” and Mac mini; and Second-generation HomePod. MacBook Air 15”, Mac Studio and Mac Pro; Apple Vision Pro™, the Company’s first spatial computer featuring its new visionOS™, expected to be available in early calendar year 2024; and iOS 17, macOS Sonoma, iPadOS 17, tvOS 17 and watchOS 10, updates to the Company’s operating systems. iPhone 15, iPhone 15 Plus, iPhone 15 Pro and iPhone 15 Pro Max; and Apple Watch Series 9 and Apple Watch Ultra 2. In May 2023, the Company announced a new share repurchase program of up to $90 billion and raised its quarterly dividend from $0.23 to $0.24 per share beginning in May 2023. During 2023, the Company repurchased $76.6 billion of its common stock and paid dividends and dividend equivalents of $15.0 billion. Macroeconomic conditions, including inflation, changes in interest rates, and currency fluctuations, have directly and indirectly impacted, and could in the future materially impact, the Company’s results of operations and financial condition. The following table shows net sales by reportable segment for 2023, 2022 and 2021 (dollars in millions): Americas net sales decreased 4% or $7.1 billion during 2023 compared to 2022 due to lower net sales of iPhone and Mac, partially offset by higher net sales of Services. Europe net sales decreased 1% or $824 million during 2023 compared to 2022. The weakness in foreign currencies relative to the U.S. dollar accounted for more than the entire year-over-year decrease in Europe net sales, which consisted primarily of lower net sales of Mac and Wearables, Home and Accessories, partially offset by higher net sales of iPhone and Services. Greater China net sales decreased 2% or $1.6 billion during 2023 compared to 2022. 
The weakness in the renminbi relative to the U.S. dollar accounted for more than the entire year-over-year decrease in Greater China net sales, which consisted primarily of lower net sales of Mac and iPhone. Japan net sales decreased 7% or $1.7 billion during 2023 compared to 2022. The weakness in the yen relative to the U.S. dollar accounted for more than the entire year-over-year decrease in Japan net sales, which consisted primarily of lower net sales of iPhone, Wearables, Home and Accessories and Mac. Rest of Asia Pacific net sales increased 1% or $240 million during 2023 compared to 2022. The weakness in foreign currencies relative to the U.S. dollar had a significantly unfavorable year-over-year impact on Rest of Asia Pacific net sales. The net sales increase consisted of higher net sales of iPhone and Services, partially offset by lower net sales of Mac and iPad. The following table shows net sales by category for 2023, 2022 and 2021 (dollars in millions): Products net sales include amortization of the deferred value of unspecified software upgrade rights, which are bundled in the sales price of the respective product. Services net sales include amortization of the deferred value of services bundled in the sales price of certain products. iPhone net sales decreased 2% or $4.9 billion during 2023 compared to 2022 due to lower net sales of non-Pro iPhone models, partially offset by higher net sales of Pro iPhone models. Mac net sales decreased 27% or $10.8 billion during 2023 compared to 2022 due primarily to lower net sales of laptops. iPad net sales decreased 3% or $1.0 billion during 2023 compared to 2022 due primarily to lower net sales of iPad mini and iPad Air, partially offset by the combined net sales of iPad 9th and 10th generation. Wearables, Home and Accessories net sales decreased 3% or $1.4 billion during 2023 compared to 2022 due primarily to lower net sales of Wearables and Accessories. 
Services net sales increased 9% or $7.1 billion during 2023 compared to 2022 due to higher net sales across all lines of business. Products and Services gross margin and gross margin percentage for 2023, 2022 and 2021 were as follows (dollars in millions): Products gross margin decreased during 2023 compared to 2022 due to the weakness in foreign currencies relative to the U.S. dollar and lower Products volume, partially offset by cost savings and a different Products mix. Products gross margin percentage increased during 2023 compared to 2022 due to cost savings and a different Products mix, partially offset by the weakness in foreign currencies relative to the U.S. dollar and decreased leverage. Services gross margin increased during 2023 compared to 2022 due primarily to higher Services net sales, partially offset by the weakness in foreign currencies relative to the U.S. dollar and higher Services costs. Services gross margin percentage decreased during 2023 compared to 2022 due to higher Services costs and the weakness in foreign currencies relative to the U.S. dollar, partially offset by a different Services mix. The Company’s future gross margins can be impacted by a variety of factors, as discussed in Part I, Item 1A of this Form 10-K under the heading “Risk Factors.” As a result, the Company believes, in general, gross margins will be subject to volatility and downward pressure. Operating expenses for 2023, 2022 and 2021 were as follows (dollars in millions): Selling, general and administrative The year-over-year growth in R&D expense in 2023 was driven primarily by increases in headcount-related expenses. Selling, general and administrative expense was relatively flat in 2023 compared to 2022. 
Provision for income taxes, effective tax rate and statutory federal income tax rate for 2023, 2022 and 2021 were as follows (dollars in millions): The Company’s effective tax rate for 2023 and 2022 was lower than the statutory federal income tax rate due primarily to a lower effective tax rate on foreign earnings, the impact of the U.S. federal R&D credit, and tax benefits from share-based compensation, partially offset by state income taxes. The Company’s effective tax rate for 2023 was lower compared to 2022 due primarily to a lower effective tax rate on foreign earnings and the impact of U.S. foreign tax credit regulations issued by the U.S. Department of the Treasury in 2022, partially offset by lower tax benefits from share-based compensation. The Company believes its balances of cash, cash equivalents and unrestricted marketable securities, which totaled $148.3\\xa0billion as of September\\xa030, 2023, along with cash generated by ongoing operations and continued access to debt markets, will be sufficient to satisfy its cash requirements and capital return program over the next 12 months and beyond. The Company’s material cash requirements include the following contractual obligations: As of September\\xa030, 2023, the Company had outstanding fixed-rate notes with varying maturities for an aggregate principal amount of $106.6 billion (collectively the “Notes”), with $9.9 billion payable within 12 months. Future interest payments associated with the Notes total $41.1 billion, with $2.9 billion payable within 12 months. The Company also issues unsecured short-term promissory notes pursuant to a commercial paper program. As of September\\xa030, 2023, the Company had $6.0 billion of commercial paper outstanding, all of which was payable within 12 months. The Company has lease arrangements for certain equipment and facilities, including corporate, data center, manufacturing and retail space. 
As of September\\xa030, 2023, the Company had fixed lease payment obligations of $15.8 billion, with $2.0 billion payable within 12 months. The Company utilizes several outsourcing partners to manufacture subassemblies for the Company’s products and to perform final assembly and testing of finished products. The Company also obtains individual components for its products from a wide variety of individual suppliers. As of September\\xa030, 2023, the Company had manufacturing purchase obligations of $53.1 billion, with $52.9 billion payable within 12 months. The Company’s manufacturing purchase obligations are primarily noncancelable. The Company’s other purchase obligations primarily consist of noncancelable obligations to acquire capital assets, including assets related to product manufacturing, and noncancelable obligations related to supplier arrangements, licensed intellectual property and content, and distribution rights. As of September\\xa030, 2023, the Company had other purchase obligations of $21.9 billion, with $5.6 billion payable within 12 months. As of September\\xa030, 2023, the balance of the deemed repatriation tax payable imposed by the U.S. Tax Cuts and Jobs Act of 2017 (the “Act”) was $22.0\\xa0billion, with $6.5\\xa0billion expected to be paid within 12 months. In addition to its contractual cash requirements, the Company has an authorized share repurchase program. The program does not obligate the Company to acquire a minimum amount of shares. As of September\\xa030, 2023, the Company’s quarterly cash dividend was $0.24 per share. The Company intends to increase its dividend on an annual basis, subject to declaration by the Board of Directors. The preparation of financial statements and related disclosures in conformity with U.S. 
generally accepted accounting principles (“GAAP”) and the Company’s discussion and analysis of its financial condition and operating results require the Company’s management to make judgments, assumptions and estimates that affect the amounts reported. Note 1, “Summary of Significant Accounting Policies” of the Notes to Consolidated Financial Statements in Part II, Item 8 of this Form 10-K describes the significant accounting policies and methods used in the preparation of the Company’s consolidated financial statements. Management bases its estimates on historical experience and on various other assumptions it believes to be reasonable under the circumstances, the results of which form the basis for making judgments about the carrying values of assets and liabilities. The Company is subject to income taxes in the U.S. and numerous foreign jurisdictions. The evaluation of the Company’s uncertain tax positions involves significant judgment in the interpretation and application of GAAP and complex domestic and international tax laws, including the Act and matters related to the allocation of international taxation rights between countries. Although management believes the Company’s reserves are reasonable, no assurance can be given that the final outcome of these uncertainties will not be different from that which is reflected in the Company’s reserves. Reserves are adjusted considering changing facts and circumstances, such as the closing of a tax examination. Resolution of these uncertainties in a manner inconsistent with management’s expectations could have a material impact on the Company’s financial condition and operating results. The Company is subject to various legal proceedings and claims that arise in the ordinary course of business, the outcomes of which are inherently uncertain. 
The Company records a liability when it is probable that a loss has been incurred and the amount is reasonably estimable, the determination of which requires significant judgment. Resolution of legal matters in a manner inconsistent with management’s expectations could have a material impact on the Company’s financial condition and operating results.',\n", + " 'MARKET_RISK_DISCLOSURES': 'The Company is exposed to economic risk from interest rates and foreign exchange rates. The Company uses various strategies to manage these risks; however, they may still impact the Company’s consolidated financial statements. The Company is primarily exposed to fluctuations in U.S. interest rates and their impact on the Company’s investment portfolio and term debt. Increases in interest rates will negatively affect the fair value of the Company’s investment portfolio and increase the interest expense on the Company’s term debt. To protect against interest rate risk, the Company may use derivative instruments, offset interest rate–sensitive assets and liabilities, or control duration of the investment and term debt portfolios. The following table sets forth potential impacts on the Company’s investment portfolio and term debt, including the effects of any associated derivatives, that would result from a hypothetical increase in relevant interest rates as of September\\xa030, 2023 and September\\xa024, 2022 (dollars in millions): The Company’s exposure to foreign exchange rate risk relates primarily to the Company being a net receiver of currencies other than the U.S. dollar. Changes in exchange rates, and in particular a strengthening of the U.S. dollar, will negatively affect the Company’s net sales and gross margins as expressed in U.S. dollars. Fluctuations in exchange rates may also affect the fair values of certain of the Company’s assets and liabilities. 
To protect against foreign exchange rate risk, the Company may use derivative instruments, offset exposures, or adjust local currency pricing of its products and services. However, the Company may choose to not hedge certain foreign currency exposures for a variety of reasons, including accounting considerations or prohibitive cost. The Company applied a value-at-risk (“VAR”) model to its foreign currency derivative positions to assess the potential impact of fluctuations in exchange rates. The VAR model used a Monte Carlo simulation. The VAR is the maximum expected loss in fair value, for a given confidence interval, to the Company’s foreign currency derivative positions due to adverse movements in rates. Based on the results of the model, the Company estimates, with 95% confidence, a maximum one-day loss in fair value of $669 million and $1.0 billion as of September\\xa030, 2023 and September\\xa024, 2022, respectively. Changes in the Company’s underlying foreign currency exposures, which were excluded from the assessment, generally offset changes in the fair values of the Company’s foreign currency derivatives.',\n", + " 'FINANCIAL_STATEMENTS': 'Consolidated Statements of Operations for the years ended September 30, 2023, September 24, 2022 and September 25, 2021 All financial statement schedules have been omitted, since the required information is not applicable or is not present in amounts sufficient to require submission of the schedule, or because the information required is included in the consolidated financial statements and accompanying notes. 
(In millions, except number of shares, which are reflected in thousands, and per-share amounts) Years ended Selling, general and administrative Operating income Shares used in computing earnings per share: Years ended Adjustment for net (gains)/losses realized and included in net income Adjustment for net (gains)/losses realized and included in net income (In millions, except number of shares, which are reflected in thousands, and par value) Shareholders’ equity: Common stock and additional paid-in capital, $0.00001 par value: 50,400,000 shares authorized; 15,550,061 and 15,943,425 shares issued and outstanding, respectively Accumulated deficit Accumulated other comprehensive loss Total shareholders’ equity Total liabilities and shareholders’ equity Years ended Total shareholders’ equity, beginning balances Beginning balances Common stock issued Common stock withheld related to net share settlement of equity awards Ending balances Beginning balances Dividends and dividend equivalents declared Common stock withheld related to net share settlement of equity awards Common stock repurchased Ending balances Accumulated other comprehensive income/(loss): Beginning balances Ending balances Total shareholders’ equity, ending balances Dividends and dividend equivalents declared per share or RSU Years ended Cash, cash equivalents and restricted cash, beginning balances Adjustments to reconcile net income to cash generated by operating activities: Cash generated by operating activities Investing activities: Cash generated by/(used in) investing activities Financing activities: Payments for taxes related to net share settlement of equity awards Proceeds from/(Repayments of) commercial paper, net Cash used in financing activities Increase/(Decrease) in cash, cash equivalents and restricted cash Cash, cash equivalents and restricted cash, ending balances Cash paid for income taxes, net Cash paid for interest The consolidated financial statements include the accounts of Apple 
Inc. and its wholly owned subsidiaries. The preparation of these consolidated financial statements and accompanying notes in conformity with GAAP requires the use of management estimates. Certain prior period amounts in the consolidated financial statements and accompanying notes have been reclassified to conform to the current period’s presentation. The Company’s fiscal year is the 52- or 53-week period that ends on the last Saturday of September. An additional week is included in the first fiscal quarter every five or six years to realign the Company’s fiscal quarters with calendar quarters, which occurred in the first fiscal quarter of 2023. The Company’s fiscal year 2023 spanned 53 weeks, whereas fiscal years 2022 and 2021 spanned 52 weeks each. Unless otherwise stated, references to particular years, quarters, months and periods refer to the Company’s fiscal years ended in September and the associated quarters, months and periods of those fiscal years. The Company records revenue net of taxes collected from customers that are remitted to governmental authorities. The Company recognizes share-based compensation expense on a straight-line basis for its estimate of equity awards that will ultimately vest. All highly liquid investments with maturities of three months or less at the date of purchase are treated as cash equivalents. The cost of securities sold is determined using the specific identification method. Inventories are measured using the first-in, first-out method. Depreciation on property, plant and equipment is recognized on a straight-line basis. The Company presents derivative assets and liabilities at their gross fair values in the Consolidated Balance Sheets. The Company records certain deferred tax assets and liabilities in connection with the minimum tax on certain foreign earnings created by the Act. 
The Company recognizes revenue at the amount to which it expects to be entitled when control of the products or services is transferred to its customers. Control is generally transferred when the Company has a present right to payment and title and the significant risks and rewards of ownership of products or services are transferred to its customers. For most of the Company’s Products net sales, control transfers when products are shipped. For the Company’s Services net sales, control transfers over time as services are delivered. Payment for Products and Services net sales is collected within a short period following transfer of control or commencement of delivery of services, as applicable. The Company records reductions to Products net sales related to future product returns, price protection and other customer incentive programs based on the Company’s expectations and historical experience. For arrangements with multiple performance obligations, which represent promises within an arrangement that are distinct, the Company allocates revenue to all distinct performance obligations based on their relative stand-alone selling prices (“SSPs”). When available, the Company uses observable prices to determine SSPs. When observable prices are not available, SSPs are established that reflect the Company’s best estimates of what the selling prices of the performance obligations would be if they were sold regularly on a stand-alone basis. The Company’s process for estimating SSPs without observable prices considers multiple factors that may vary depending upon the unique facts and circumstances related to each performance obligation including, where applicable, prices charged by the Company for similar offerings, market trends in the pricing for similar offerings, product-specific business objectives and the estimated cost to provide the performance obligation. 
The Company has identified up to three performance obligations regularly included in arrangements involving the sale of iPhone, Mac, iPad and certain other products. The first performance obligation, which represents the substantial portion of the allocated sales price, is the hardware and bundled software delivered at the time of sale. The second performance obligation is the right to receive certain product-related bundled services, which include iCloud and Maps. The third performance obligation is the right to receive, on a when-and-if-available basis, future unspecified software upgrades relating to the software bundled with each device. The Company allocates revenue and any related discounts to these performance obligations based on their relative SSPs. Because the Company lacks observable prices for the undelivered performance obligations, the allocation of revenue is based on the Company’s estimated SSPs. Revenue allocated to the delivered hardware and bundled software is recognized when control has transferred to the customer, which generally occurs when the product is shipped. Revenue allocated to the product-related bundled services and unspecified software upgrade rights is deferred and recognized on a straight-line basis over the estimated period they are expected to be provided. For certain long-term service arrangements, the Company has performance obligations for services it has not yet delivered. For these arrangements, the Company does not have a right to bill for the undelivered services. The Company has determined that any unbilled consideration relates entirely to the value of the undelivered services. Accordingly, the Company has not recognized revenue, and does not disclose amounts, related to these undelivered services. For the sale of third-party products where the Company obtains control of the product before transferring it to the customer, the Company recognizes revenue based on the gross amount billed to customers. 
The Company considers multiple factors when determining whether it obtains control of third-party products, including evaluating if it can establish the price of the product, retains inventory risk for tangible products or has the responsibility for ensuring acceptability of the product. For third-party applications sold through the App Store, the Company does not obtain control of the product before transferring it to the customer. Therefore, the Company accounts for all third-party application–related sales on a net basis by recognizing in Services net sales only the commission it retains. Net sales disaggregated by significant products and services for 2023, 2022 and 2021 were as follows (in millions): Products net sales include amortization of the deferred value of unspecified software upgrade rights, which are bundled in the sales price of the respective product. Services net sales include amortization of the deferred value of services bundled in the sales price of certain products. Total net sales include $8.2 billion of revenue recognized in 2023 that was included in deferred revenue as of September\\xa024, 2022, $7.5 billion of revenue recognized in 2022 that was included in deferred revenue as of September\\xa025, 2021, and $6.7 billion of revenue recognized in 2021 that was included in deferred revenue as of September\\xa026, 2020. The Company’s proportion of net sales by disaggregated revenue source was generally consistent for each reportable segment in Note 13, “Segment Information and Geographic Data” for 2023, 2022 and 2021, except in Greater China, where iPhone revenue represented a moderately higher proportion of net sales. 
The following table shows the computation of basic and diluted earnings per share for 2023, 2022 and 2021 (net income in millions and shares in thousands): Weighted-average diluted shares Diluted earnings per share Approximately 24\\xa0million restricted stock units (“RSUs”) were excluded from the computation of diluted earnings per share for 2023 because their effect would have been antidilutive. The following tables show the Company’s cash, cash equivalents and marketable securities by significant investment category as of September\\xa030, 2023 and September\\xa024, 2022 (in millions): The valuation techniques used to measure the fair values of the Company’s Level 2 financial instruments, which generally have counterparties with high credit ratings, are based on quoted market prices or model-driven valuations using significant inputs derived from or corroborated by observable market data. As of September\\xa030, 2023 and September\\xa024, 2022, total marketable securities included $13.8 billion and $12.7 billion, respectively, that were restricted from general use, related to the State Aid Decision (refer to Note 7, “Income Taxes”) and other agreements. The following table shows the fair value of the Company’s non-current marketable debt securities, by contractual maturity, as of September\\xa030, 2023 (in millions): The Company’s investments in marketable debt securities have been classified and accounted for as available-for-sale. The Company classifies marketable debt securities as either current or non-current based solely on each instrument’s underlying contractual maturity date. The Company may use derivative instruments to partially offset its business exposure to foreign exchange and interest rate risk. However, the Company may choose not to hedge certain exposures for a variety of reasons including accounting considerations or the prohibitive economic cost of hedging particular exposures. 
There can be no assurance the hedges will offset more than a portion of the financial impact resulting from movements in foreign exchange or interest rates. The Company classifies cash flows related to derivative instruments in the same section of the Consolidated Statements of Cash Flows as the items being hedged, which are generally classified as operating activities. To protect gross margins from fluctuations in foreign exchange rates, the Company may use forwards, options or other instruments, and may designate these instruments as cash flow hedges. The Company generally hedges portions of its forecasted foreign currency exposure associated with revenue and inventory purchases, typically for up to 12 months. To protect the Company’s foreign currency–denominated term debt or marketable securities from fluctuations in foreign exchange rates, the Company may use forwards, cross-currency swaps or other instruments. The Company designates these instruments as either cash flow or fair value hedges. As of September\\xa030, 2023, the maximum length of time over which the Company is hedging its exposure to the variability in future cash flows for term debt–related foreign currency transactions is 19 years. The Company may also use derivative instruments that are not designated as accounting hedges to protect gross margins from certain fluctuations in foreign exchange rates, as well as to offset a portion of the foreign currency gains and losses generated by the remeasurement of certain assets and liabilities denominated in non-functional currencies. To protect the Company’s term debt or marketable securities from fluctuations in interest rates, the Company may use interest rate swaps, options or other instruments. The Company designates these instruments as either cash flow or fair value hedges. 
The notional amounts of the Company’s outstanding derivative instruments as of September\\xa030, 2023 and September\\xa024, 2022 were as follows (in millions): Derivative instruments designated as accounting hedges: Derivative instruments not designated as accounting hedges: The gross fair values of the Company’s derivative assets and liabilities as of September\\xa024, 2022 were as follows (in millions): Derivative assets are measured using Level 2 fair value inputs and are included in other current assets and other non-current assets in the Consolidated Balance Sheet. Derivative liabilities are measured using Level 2 fair value inputs and are included in other current liabilities and other non-current liabilities in the Consolidated Balance Sheet. The derivative assets above represent the Company’s gross credit exposure if all counterparties failed to perform. To mitigate credit risk, the Company generally uses collateral security arrangements that provide for collateral to be received or posted when the net fair values of certain derivatives fluctuate from contractually established thresholds. To further limit credit risk, the Company generally uses master netting arrangements with the respective counterparties to the Company’s derivative contracts, under which the Company is allowed to settle transactions with a single net amount payable by one party to the other. As of September\\xa024, 2022, the potential effects of these rights of set-off associated with the Company’s derivative contracts, including the effects of collateral, would be a reduction to both derivative assets and derivative liabilities of $7.8 billion, resulting in a net derivative asset of $412 million. 
The carrying amounts of the Company’s hedged items in fair value hedges as of September\\xa030, 2023 and September\\xa024, 2022 were as follows (in millions): Hedged assets/(liabilities): As of September 24, 2022, the Company had one customer that represented 10% or more of total trade receivables, which accounted for 10%. The Company’s third-party cellular network carriers accounted for 41% and 44% of total trade receivables as of September\\xa030, 2023 and September\\xa024, 2022, respectively. The Company requires third-party credit support or collateral from certain customers to limit credit risk. The following table shows the Company’s gross property, plant and equipment by major asset class and accumulated depreciation as of September\\xa030, 2023 and September\\xa024, 2022 (in millions): Accumulated depreciation Depreciation expense on property, plant and equipment was $8.5 billion, $8.7 billion and $9.5 billion during 2023, 2022 and 2021, respectively. The following tables show the Company’s consolidated financial statement details as of September\\xa030, 2023 and September\\xa024, 2022 (in millions): The following table shows the detail of other income/(expense), net for 2023, 2022 and 2021 (in millions): The provision for income taxes for 2023, 2022 and 2021, consisted of the following (in millions): The foreign provision for income taxes is based on foreign pretax earnings of $72.9 billion, $71.3 billion and $68.7 billion in 2023, 2022 and 2021, respectively. 
A reconciliation of the provision for income taxes to the amount computed by applying the statutory federal income tax rate (21% in 2023, 2022 and 2021) to income before provision for income taxes for 2023, 2022 and 2021, is as follows (dollars in millions): Computed expected tax As of September\\xa030, 2023 and September\\xa024, 2022, the significant components of the Company’s deferred tax assets and liabilities were (in millions): Accrued liabilities and other reserves Capitalized research and development Net deferred tax assets As of September\\xa030, 2023, the Company had $5.2\\xa0billion in foreign tax credit carryforwards in Ireland and $3.0\\xa0billion in California R&D credit carryforwards, both of which can be carried forward indefinitely. A valuation allowance has been recorded for the credit carryforwards and a portion of other temporary differences. As of September\\xa030, 2023, the total amount of gross unrecognized tax benefits was $19.5 billion, of which $9.5 billion, if recognized, would impact the Company’s effective tax rate. As of September\\xa024, 2022, the total amount of gross unrecognized tax benefits was $16.8 billion, of which $8.0 billion, if recognized, would have impacted the Company’s effective tax rate. The aggregate change in the balance of gross unrecognized tax benefits, which excludes interest and penalties, for 2023, 2022 and 2021, is as follows (in millions): Beginning balances Increases related to tax positions taken during a prior year Decreases related to tax positions taken during a prior year Increases related to tax positions taken during the current year Decreases related to settlements with taxing authorities Decreases related to expiration of the statute of limitations Ending balances The Company is subject to taxation and files income tax returns in the U.S. federal jurisdiction and many state and foreign jurisd ictions. Tax years after 2017 for the U.S. 
federal jurisdiction, and after 2014 in certain major foreign jurisdictions, remain subject to examination. Altho ugh the timing of resolution or closure of examinations is not certain, the Company believes it is reasonably possible that its gross unrecognized tax benefits could decrease in the next 12 months by as much as $4.5\\xa0billion. On August 30, 2016, the European Commission announced its decision that Ireland granted state aid to the Company by providing tax opinions in 1991 and 2007 concerning the tax allocation of profits of the Irish branches of two subsidiaries of the Company (the “State Aid Decision”). The State Aid Decision ordered Ireland to calculate and recover additional taxes from the Company for the period June 2003 through December 2014. Irish legislative changes, effective as of January 2015, eliminated the application of the tax opinions from that date forward. The recovery amount was calculated to be €13.1 billion, plus interest of €1.2 billion. The Company and Ireland appealed the State Aid Decision to the General Court of the Court of Justice of the European Union (the “General Court”). On July 15, 2020, the General Court annulled the State Aid Decision. On September 25, 2020, the European Commission appealed the General Court’s decision to the European Court of Justice (the “ECJ”) and a hearing was held on May 23, 2023. A decision from the ECJ is expected in calendar year 2024. The Company believes it would be eligible to claim a U.S. foreign tax credit for a portion of any incremental Irish corporate income taxes potentially due related to the State Aid Decision. On an annual basis, the Company may request approval from the Irish Minister for Finance to reduce the recovery amount for certain taxes paid to other countries. As of September\\xa030, 2023, the adjusted recovery amount was €12.7 billion, excluding interest. 
The adjusted recovery amount plus interest is funded into escrow, where it will remain restricted from general use pending the conclusion of all legal proceedings. Refer to the Cash, Cash Equivalents and Marketable Securities section of Note 4, “Financial Instruments” for more information. The Company has lease arrangements for certain equipment and facilities, including corporate, data center, manufacturing and retail space. These leases typically have original terms not exceeding 10 years and generally contain multiyear renewal options, some of which are reasonably certain of exercise. The following table shows ROU assets and lease liabilities, and the associated financial statement line items, as of September\\xa030, 2023 and September\\xa024, 2022 (in millions): Lease liability maturities as of September\\xa030, 2023, are as follows (in millions): The weighted-average remaining lease term related to the Company’s lease liabilities as of September\\xa030, 2023 and September\\xa024, 2022 was 10.6 years and 10.1 years, respectively. The discount rate related to the Company’s lease liabilities as of September\\xa030, 2023 and September\\xa024, 2022 was 3.0% and 2.3%, respectively. The discount rates related to the Company’s lease liabilities are generally based on estimates of the Company’s incremental borrowing rate, as the discount rates implicit in the Company’s leases cannot be readily determined. As of September\\xa030, 2023, the Company had $544 million of future payments under additional leases, primarily for corporate facilities and retail space, that had not yet commenced. These leases will commence between 2024 and 2026, with lease terms ranging from 1 year to 21 years. Proceeds from/(Repayments of) commercial paper, net Proceeds from/(Repayments of) commercial paper, net The Company has outstanding Notes, which are senior unsecured obligations with interest payable in arrears. 
The following table provides a summary of the Company’s term debt as of September\\xa030, 2023 and September\\xa024, 2022: Unamortized premium/(discount) and issuance costs, net To manage interest rate risk on certain of its U.S. dollar–denominated fixed-rate notes, the Company uses interest rate swaps to effectively convert the fixed interest rates to floating interest rates on a portion of these notes. Additionally, to manage foreign exchange rate risk on certain of its foreign currency–denominated notes, the Company uses cross-currency swaps to effectively convert these notes to U.S. dollar–denominated notes. The effective interest rates for the Notes include the interest on the Notes, amortization of the discount or premium and, if applicable, adjustments related to hedging. The Company recognized $3.7 billion, $2.8 billion and $2.6 billion of interest expense on its term debt for 2023, 2022 and 2021, respectively. The future principal payments for the Company’s Notes as of September\\xa030, 2023, are as follows (in millions): As of September\\xa030, 2023 and September\\xa024, 2022, the fair value of the Company’s Notes, based on Level 2 inputs, was $90.8 billion and $98.8 billion, respectively. During 2023, the Company repurchased 471 million shares of its common stock for $76.6 billion, excluding excise tax due under the Inflation Reduction Act of 2022. The Company’s share repurchase programs do not obligate the Company to acquire a minimum amount of shares. Under the programs, shares may be repurchased in privately negotiated or open market transactions, including under plans complying with Rule 10b5-1 under the Exchange Act. The following table shows the changes in shares of common stock for 2023, 2022 and 2021 (in thousands): Common stock outstanding, beginning balances Common stock repurchased Common stock issued, net of shares withheld for employee taxes Common stock outstanding, ending balances The Apple Inc. 
2022 Employee Stock Plan (the “2022 Plan”) is a shareholder-approved plan that provides for broad-based equity grants to employees, including executive officers, and permits the granting of RSUs, stock grants, performance-based awards, stock options and stock appreciation rights. RSUs granted under the 2022 Plan generally vest over four years, based on continued employment, and are settled upon vesting in shares of the Company’s common stock on a one-for-one basis. All RSUs granted under the 2022 Plan have dividend equivalent rights, which entitle holders of RSUs to the same dividend value per share as holders of common stock. A maximum of approximately 1.3 billion shares were authorized for issuance pursuant to 2022 Plan awards at the time the plan was approved on March 4, 2022. The Apple Inc. 2014 Employee Stock Plan (the “2014 Plan”) is a shareholder-approved plan that provided for broad-based equity grants to employees, including executive officers. The 2014 Plan permitted the granting of substantially the same types of equity awards with substantially the same terms as the 2022 Plan. The 2014 Plan also permitted the granting of cash bonus awards. In the third quarter of 2022, the Company terminated the authority to grant new awards under the 2014 Plan. A summary of the Company’s RSU activity and related information for 2023, 2022 and 2021, is as follows: RSUs granted RSUs vested RSUs canceled RSUs granted RSUs vested RSUs canceled RSUs granted RSUs vested RSUs canceled The following table shows share-based compensation expense and the related income tax benefit included in the Consolidated Statements of Operations for 2023, 2022 and 2021 (in millions): Income tax benefit related to share-based compensation expense As of September\\xa030, 2023, the total unrecognized compensation cost related to outstanding RSUs was $18.6 billion, which the Company expects to recognize over a weighted-average period of 2.5 years. 
The Company has entered into certain off–balance sheet commitments that require the future purchase of goods or services (“unconditional purchase obligations”). The Company’s unconditional purchase obligations primarily consist of supplier arrangements, licensed intellectual property and content, and distribution rights. Future payments under noncancelable unconditional purchase obligations with a remaining term in excess of one year as of September\\xa030, 2023, are as follows (in millions): The Company is subject to various legal proceedings and claims that have arisen in the ordinary course of business and that have not been fully resolved. The outcome of litigation is inherently uncertain. In the opinion of management, there was not at least a reasonable possibility the Company may have incurred a material loss, or a material loss greater than a recorded accrual, concerning loss contingencies for asserted legal and other claims. Although most components essential to the Company’s business are generally available from multiple sources, certain components are currently obtained from single or limited sources. The Company also competes for various components with other participants in the markets for smartphones, personal computers, tablets, wearables and accessories. Therefore, many components used by the Company, including those that are available from multiple sources, are at times subject to industry-wide shortage and significant commodity pricing fluctuations. The Company uses some custom components that are not commonly used by its competitors, and new products introduced by the Company often utilize custom components available from only one source. When a component or product uses new technologies, initial capacity constraints may exist until the suppliers’ yields have matured or their manufacturing capacities have increased. 
The continued availability of these components at acceptable prices, or at all, may be affected if suppliers decide to concentrate on the production of common components instead of components customized to meet the Company’s requirements. Substantially all of the Company’s hardware products are manufactured by outsourcing partners that are located primarily in China mainland, India, Japan, South Korea, Taiwan and Vietnam. The Company manages its business primarily on a geographic basis. The Company’s reportable segments consist of the Americas, Europe, Greater China, Japan and Rest of Asia Pacific. Americas includes both North and South America. Europe includes European countries, as well as India, the Middle East and Africa. Greater China includes China mainland, Hong Kong and Taiwan. Rest of Asia Pacific includes Australia and those Asian countries not included in the Company’s other reportable segments. Although the reportable segments provide similar hardware and software products and similar services, each one is managed separately to better align with the location of the Company’s customers and distribution partners and the unique market dynamics of each geographic region. The Company evaluates the performance of its reportable segments based on net sales and operating income. Net sales for geographic segments are generally based on the location of customers and sales through the Company’s retail stores located in those geographic locations. Operating income for each segment consists of net sales to third parties, related cost of sales, and operating expenses directly attributable to the segment. The information provided to the Company’s chief operating decision maker for purposes of making decisions and assessing segment performance excludes asset information. 
The following table shows information by reportable segment for 2023, 2022 and 2021 (in millions): Operating income Operating income Operating income Operating income Operating income A reconciliation of the Company’s segment operating income to the Consolidated Statements of Operations for 2023, 2022 and 2021 is as follows (in millions): Segment operating income Total operating income Includes corporate marketing expenses, certain share-based compensation expenses, various nonrecurring charges, and other separately managed general and administrative costs. The U.S. and China were the only countries that accounted for more than 10% of the Company’s net sales in 2023, 2022 and 2021. Net sales for 2023, 2022 and 2021 and long-lived assets as of September\\xa030, 2023 and September\\xa024, 2022 were as follows (in millions): We have audited the accompanying consolidated balance sheets of Apple Inc. as of September\\xa030, 2023 and September\\xa024, 2022, the related consolidated statements of operations, comprehensive income, shareholders’ equity and cash flows for each of the three years in the period ended September\\xa030, 2023, and the related notes (collectively referred to as the “financial statements”). In our opinion, the financial statements present fairly, in all material respects, the financial position of Apple Inc. at September\\xa030, 2023 and September\\xa024, 2022, and the results of its operations and its cash flows for each of the three years in the period ended September\\xa030, 2023, in conformity with U.S. generally accepted accounting principles. 
We also have audited, in accordance with the standards of the Public Company Accounting Oversight Board (United States) (the “PCAOB”), Apple Inc.’s internal control over financial reporting as of September\\xa030, 2023, based on criteria established in issued by the Committee of Sponsoring Organizations of the Treadway Commission (2013 framework) and our report dated November\\xa02, 2023 expressed an unqualified opinion thereon. These financial statements are the responsibility of Apple Inc.’s management. Our responsibility is to express an opinion on Apple Inc.’s financial statements based on our audits. We are a public accounting firm registered with the PCAOB and are required to be independent with respect to Apple Inc. in accordance with the U.S. federal securities laws and the applicable rules and regulations of the U.S. Securities and Exchange Commission and the PCAOB. We conducted our audits in accordance with the standards of the PCAOB. Those standards require that we plan and perform the audit to obtain reasonable assurance about whether the financial statements are free of material misstatement, whether due to error or fraud. Our audits included performing procedures to assess the risks of material misstatement of the financial statements, whether due to error or fraud, and performing procedures that respond to those risks. Such procedures included examining, on a test basis, evidence regarding the amounts and disclosures in the financial statements. Our audits also included evaluating the accounting principles used and significant estimates made by management, as well as evaluating the overall presentation of the financial statements. We believe that our audits provide a reasonable basis for our opinion. 
The critical audit matter communicated below is a matter arising from the current period audit of the financial statements that was communicated or required to be communicated to the audit committee and that: (1)\\xa0relates to accounts or disclosures that are material to the financial statements and (2)\\xa0involved our especially challenging, subjective, or complex judgments. The communication of the critical audit matter does not alter in any way our opinion on the financial statements, taken as a whole, and we are not, by communicating the critical audit matter below, providing a separate opinion on the critical audit matter or on the account or disclosure to which it relates. As discussed in Note 7 to the financial statements, Apple Inc. is subject to taxation and files income tax returns in the U.S. federal jurisdiction and many state and foreign jurisdictions. As of September\\xa030, 2023, the total amount of gross unrecognized tax benefits was $19.5 billion, of which $9.5 billion, if recognized, would impact Apple Inc.’s effective tax rate. In accounting for some of the uncertain tax positions, Apple Inc. uses significant judgment in the interpretation and application of complex domestic and international tax laws.Auditing management’s evaluation of whether an uncertain tax position is more likely than not to be sustained and the measurement of the benefit of various tax positions can be complex, involves significant judgment, and is based on interpretations of tax laws and legal rulings. 
We tested controls relating to the evaluation of uncertain tax positions, including controls over management’s assessment as to whether tax positions are more likely than not to be sustained, management’s process to measure the benefit of its tax positions, and the development of the related disclosures.To evaluate Apple Inc.’s assessment of which tax positions are more likely than not to be sustained, our audit procedures included, among others, reading and evaluating management’s assumptions and analysis, and, as applicable, Apple Inc.’s communications with taxing authorities, that detailed the basis and technical merits of the uncertain tax positions. We involved our tax subject matter resources in assessing the technical merits of certain of Apple Inc.’s tax positions based on our knowledge of relevant tax laws and experience with related taxing authorities. For certain tax positions, we also received external legal counsel confirmation letters and discussed the matters with external advisors and Apple Inc. tax personnel. In addition, we evaluated Apple Inc.’s disclosure in relation to these matters included in Note 7 to the financial statements. We have served as Apple Inc.’s auditor since 2009. We have audited Apple Inc.’s internal control over financial reporting as of September\\xa030, 2023, based on criteria established in issued by the Committee of Sponsoring Organizations of the Treadway Commission (2013 framework) (the “COSO criteria”). In our opinion, Apple Inc. maintained, in all material respects, effective internal control over financial reporting as of September\\xa030, 2023, based on the COSO criteria. We also have audited, in accordance with the standards of the Public Company Accounting Oversight Board (United States) (the “PCAOB”), the consolidated balance sheets of Apple Inc. 
as of September\\xa030, 2023 and September\\xa024, 2022, the related consolidated statements of operations, comprehensive income, shareholders’ equity and cash flows for each of the three years in the period ended September\\xa030, 2023, and the related notes and our report dated November\\xa02, 2023 expressed an unqualified opinion thereon. Apple Inc.’s management is responsible for maintaining effective internal control over financial reporting, and for its assessment of the effectiveness of internal control over financial reporting included in the accompanying Management’s Annual Report on Internal Control over Financial Reporting. Our responsibility is to express an opinion on Apple Inc.’s internal control over financial reporting based on our audit. We are a public accounting firm registered with the PCAOB and are required to be independent with respect to Apple Inc. in accordance with the U.S. federal securities laws and the applicable rules and regulations of the U.S. Securities and Exchange Commission and the PCAOB. We conducted our audit in accordance with the standards of the PCAOB. Those standards require that we plan and perform the audit to obtain reasonable assurance about whether effective internal control over financial reporting was maintained in all material respects. Our audit included obtaining an understanding of internal control over financial reporting, assessing the risk that a material weakness exists, testing and evaluating the design and operating effectiveness of internal control based on the assessed risk, and performing such other procedures as we considered necessary in the circumstances. We believe that our audit provides a reasonable basis for our opinion. A company’s internal control over financial reporting is a process designed to provide reasonable assurance regarding the reliability of financial reporting and the preparation of financial statements for external purposes in accordance with U.S. 
generally accepted accounting principles. A company’s internal control over financial reporting includes those policies and procedures that (1)\\xa0pertain to the maintenance of records that, in reasonable detail, accurately and fairly reflect the transactions and dispositions of the assets of the company; (2)\\xa0provide reasonable assurance that transactions are recorded as necessary to permit preparation of financial statements in accordance with U.S. generally accepted accounting principles, and that receipts and expenditures of the company are being made only in accordance with authorizations of management and directors of the company; and (3)\\xa0provide reasonable assurance regarding prevention or timely detection of unauthorized acquisition, use, or disposition of the company’s assets that could have a material effect on the financial statements. Because of its inherent limitations, internal control over financial reporting may not prevent or detect misstatements. Also, projections of any evaluation of effectiveness to future periods are subject to the risk that controls may become inadequate because of changes in conditions, or that the degree of compliance with the policies or procedures may deteriorate.',\n", + " 'ACCOUNTING_DISAGREEMENTS': '',\n", + " 'CONTROLS_AND_PROCEDURES': 'Based on an evaluation under the supervision and with the participation of the Company’s management, the Company’s principal executive officer and principal financial officer have concluded that the Company’s disclosure controls and procedures as defined in Rules 13a-15(e) and 15d-15(e) under the Exchange Act were effective as of September\\xa030, 2023 to provide reasonable assurance that information required to be disclosed by the Company in reports that it files or submits under the Exchange Act is (i)\\xa0recorded, processed, summarized and reported within the time periods specified in the SEC rules and forms and (ii)\\xa0accumulated and communicated to the Company’s 
management, including its principal executive officer and principal financial officer, as appropriate to allow timely decisions regarding required disclosure. The Company’s internal control over financial reporting is designed to provide reasonable assurance regarding the reliability of financial reporting and the preparation of financial statements for external purposes in accordance with GAAP. The Company’s internal control over financial reporting includes those policies and procedures that: pertain to the maintenance of records that, in reasonable detail, accurately and fairly reflect the transactions and dispositions of the Company’s assets; provide reasonable assurance that transactions are recorded as necessary to permit preparation of financial statements in accordance with GAAP, and that the Company’s receipts and expenditures are being made only in accordance with authorizations of the Company’s management and directors; and provide reasonable assurance regarding prevention or timely detection of unauthorized acquisition, use, or disposition of the Company’s assets that could have a material effect on the financial statements. Management, including the Company’s Chief Executive Officer and Chief Financial Officer, does not expect that the Company’s internal controls will prevent or detect all errors and all fraud. A control system, no matter how well designed and operated, can provide only reasonable, not absolute, assurance that the objectives of the control system are met. Further, the design of a control system must reflect the fact that there are resource constraints, and the benefits of controls must be considered relative to their costs. Because of the inherent limitations in all control systems, no evaluation of internal controls can provide absolute assurance that all control issues and instances of fraud, if any, have been detected. 
Also, any evaluation of the effectiveness of controls in future periods are subject to the risk that those internal controls may become inadequate because of changes in business conditions, or that the degree of compliance with the policies or procedures may deteriorate. The Company’s management is responsible for establishing and maintaining adequate internal control over financial reporting (as defined in Rule 13a-15(f) under the Exchange Act). Management conducted an assessment of the effectiveness of the Company’s internal control over financial reporting based on the criteria set forth in Internal Control – Integrated Framework issued by the Committee of Sponsoring Organizations of the Treadway Commission (2013 framework). Based on the Company’s assessment, management has concluded that its internal control over financial reporting was effective as of September\\xa030, 2023 to provide reasonable assurance regarding the reliability of financial reporting and the preparation of financial statements in accordance with GAAP. The Company’s independent registered public accounting firm, Ernst & Young LLP, has issued an audit report on the Company’s internal control over financial reporting, which appears in Part II, Item 8 of this Form 10-K. There were no changes in the Company’s internal control over financial reporting during the fourth quarter of 2023, which were identified in connection with management’s evaluation required by paragraph (d) of Rules 13a-15 and 15d-15 under the Exchange Act, that have materially affected, or are reasonably likely to materially affect, the Company’s internal control over financial reporting.',\n", + " 'FOREIGN_JURISDICTIONS': '',\n", + " 'MANAGEMENT': 'The following discussion should be read in conjunction with the consolidated financial statements and accompanying notes included in Part II, Item 8 of this Form 10-K. This Item generally discusses 2023 and 2022 items and year-to-year comparisons between 2023 and 2022. 
Discussions of 2021 items and year-to-year comparisons between 2022 and 2021 are not included, and can be found in “Management’s Discussion and Analysis of Financial Condition and Results of Operations” in Part II, Item 7 of the Company’s Annual Report on Form 10-K for the fiscal year ended September\\xa024, 2022. The Company’s fiscal year is the 52- or 53-week period that ends on the last Saturday of September. An additional week is included in the first fiscal quarter every five or six years to realign the Company’s fiscal quarters with calendar quarters, which occurred in the first quarter of 2023. The Company’s fiscal year 2023 spanned 53 weeks, whereas fiscal years 2022 and 2021 spanned 52 weeks each. The Company’s total net sales were $383.3 billion and net income was $97.0 billion during 2023. The Company’s total net sales decreased 3% or $11.0 billion during 2023 compared to 2022. The weakness in foreign currencies relative to the U.S. dollar accounted for more than the entire year-over-year decrease in total net sales, which consisted primarily of lower net sales of Mac and iPhone, partially offset by higher net sales of Services. The Company announces new product, service and software offerings at various times during the year. Significant announcements during fiscal year 2023 included the following: iPad and iPad Pro; Next-generation Apple TV 4K; and MLS Season Pass, a Major League Soccer subscription streaming service. MacBook Pro 14”, MacBook Pro 16” and Mac mini; and Second-generation HomePod. MacBook Air 15”, Mac Studio and Mac Pro; Apple Vision Pro™, the Company’s first spatial computer featuring its new visionOS™, expected to be available in early calendar year 2024; and iOS 17, macOS Sonoma, iPadOS 17, tvOS 17 and watchOS 10, updates to the Company’s operating systems. iPhone 15, iPhone 15 Plus, iPhone 15 Pro and iPhone 15 Pro Max; and Apple Watch Series 9 and Apple Watch Ultra 2. 
In May 2023, the Company announced a new share repurchase program of up to $90 billion and raised its quarterly dividend from $0.23 to $0.24 per share beginning in May 2023. During 2023, the Company repurchased $76.6 billion of its common stock and paid dividends and dividend equivalents of $15.0 billion. Macroeconomic conditions, including inflation, changes in interest rates, and currency fluctuations, have directly and indirectly impacted, and could in the future materially impact, the Company’s results of operations and financial condition. The following table shows net sales by reportable segment for 2023, 2022 and 2021 (dollars in millions): Americas net sales decreased 4% or $7.1 billion during 2023 compared to 2022 due to lower net sales of iPhone and Mac, partially offset by higher net sales of Services. Europe net sales decreased 1% or $824 million during 2023 compared to 2022. The weakness in foreign currencies relative to the U.S. dollar accounted for more than the entire year-over-year decrease in Europe net sales, which consisted primarily of lower net sales of Mac and Wearables, Home and Accessories, partially offset by higher net sales of iPhone and Services. Greater China net sales decreased 2% or $1.6 billion during 2023 compared to 2022. The weakness in the renminbi relative to the U.S. dollar accounted for more than the entire year-over-year decrease in Greater China net sales, which consisted primarily of lower net sales of Mac and iPhone. Japan net sales decreased 7% or $1.7 billion during 2023 compared to 2022. The weakness in the yen relative to the U.S. dollar accounted for more than the entire year-over-year decrease in Japan net sales, which consisted primarily of lower net sales of iPhone, Wearables, Home and Accessories and Mac. Rest of Asia Pacific net sales increased 1% or $240 million during 2023 compared to 2022. The weakness in foreign currencies relative to the U.S. 
dollar had a significantly unfavorable year-over-year impact on Rest of Asia Pacific net sales. The net sales increase consisted of higher net sales of iPhone and Services, partially offset by lower net sales of Mac and iPad. The following table shows net sales by category for 2023, 2022 and 2021 (dollars in millions): Products net sales include amortization of the deferred value of unspecified software upgrade rights, which are bundled in the sales price of the respective product. Services net sales include amortization of the deferred value of services bundled in the sales price of certain products. iPhone net sales decreased 2% or $4.9 billion during 2023 compared to 2022 due to lower net sales of non-Pro iPhone models, partially offset by higher net sales of Pro iPhone models. Mac net sales decreased 27% or $10.8 billion during 2023 compared to 2022 due primarily to lower net sales of laptops. iPad net sales decreased 3% or $1.0 billion during 2023 compared to 2022 due primarily to lower net sales of iPad mini and iPad Air, partially offset by the combined net sales of iPad 9th and 10th generation. Wearables, Home and Accessories net sales decreased 3% or $1.4 billion during 2023 compared to 2022 due primarily to lower net sales of Wearables and Accessories. Services net sales increased 9% or $7.1 billion during 2023 compared to 2022 due to higher net sales across all lines of business. Products and Services gross margin and gross margin percentage for 2023, 2022 and 2021 were as follows (dollars in millions): Products gross margin decreased during 2023 compared to 2022 due to the weakness in foreign currencies relative to the U.S. dollar and lower Products volume, partially offset by cost savings and a different Products mix. Products gross margin percentage increased during 2023 compared to 2022 due to cost savings and a different Products mix, partially offset by the weakness in foreign currencies relative to the U.S. dollar and decreased leverage. 
Services gross margin increased during 2023 compared to 2022 due primarily to higher Services net sales, partially offset by the weakness in foreign currencies relative to the U.S. dollar and higher Services costs. Services gross margin percentage decreased during 2023 compared to 2022 due to higher Services costs and the weakness in foreign currencies relative to the U.S. dollar, partially offset by a different Services mix. The Company’s future gross margins can be impacted by a variety of factors, as discussed in Part I, Item 1A of this Form 10-K under the heading “Risk Factors.” As a result, the Company believes, in general, gross margins will be subject to volatility and downward pressure. Operating expenses for 2023, 2022 and 2021 were as follows (dollars in millions): Selling, general and administrative The year-over-year growth in R&D expense in 2023 was driven primarily by increases in headcount-related expenses. Selling, general and administrative expense was relatively flat in 2023 compared to 2022. Provision for income taxes, effective tax rate and statutory federal income tax rate for 2023, 2022 and 2021 were as follows (dollars in millions): The Company’s effective tax rate for 2023 and 2022 was lower than the statutory federal income tax rate due primarily to a lower effective tax rate on foreign earnings, the impact of the U.S. federal R&D credit, and tax benefits from share-based compensation, partially offset by state income taxes. The Company’s effective tax rate for 2023 was lower compared to 2022 due primarily to a lower effective tax rate on foreign earnings and the impact of U.S. foreign tax credit regulations issued by the U.S. Department of the Treasury in 2022, partially offset by lower tax benefits from share-based compensation. 
The Company believes its balances of cash, cash equivalents and unrestricted marketable securities, which totaled $148.3\\xa0billion as of September\\xa030, 2023, along with cash generated by ongoing operations and continued access to debt markets, will be sufficient to satisfy its cash requirements and capital return program over the next 12 months and beyond. The Company’s material cash requirements include the following contractual obligations: As of September\\xa030, 2023, the Company had outstanding fixed-rate notes with varying maturities for an aggregate principal amount of $106.6 billion (collectively the “Notes”), with $9.9 billion payable within 12 months. Future interest payments associated with the Notes total $41.1 billion, with $2.9 billion payable within 12 months. The Company also issues unsecured short-term promissory notes pursuant to a commercial paper program. As of September\\xa030, 2023, the Company had $6.0 billion of commercial paper outstanding, all of which was payable within 12 months. The Company has lease arrangements for certain equipment and facilities, including corporate, data center, manufacturing and retail space. As of September\\xa030, 2023, the Company had fixed lease payment obligations of $15.8 billion, with $2.0 billion payable within 12 months. The Company utilizes several outsourcing partners to manufacture subassemblies for the Company’s products and to perform final assembly and testing of finished products. The Company also obtains individual components for its products from a wide variety of individual suppliers. As of September\\xa030, 2023, the Company had manufacturing purchase obligations of $53.1 billion, with $52.9 billion payable within 12 months. The Company’s manufacturing purchase obligations are primarily noncancelable. 
The Company’s other purchase obligations primarily consist of noncancelable obligations to acquire capital assets, including assets related to product manufacturing, and noncancelable obligations related to supplier arrangements, licensed intellectual property and content, and distribution rights. As of September\\xa030, 2023, the Company had other purchase obligations of $21.9 billion, with $5.6 billion payable within 12 months. As of September\\xa030, 2023, the balance of the deemed repatriation tax payable imposed by the U.S. Tax Cuts and Jobs Act of 2017 (the “Act”) was $22.0\\xa0billion, with $6.5\\xa0billion expected to be paid within 12 months. In addition to its contractual cash requirements, the Company has an authorized share repurchase program. The program does not obligate the Company to acquire a minimum amount of shares. As of September\\xa030, 2023, the Company’s quarterly cash dividend was $0.24 per share. The Company intends to increase its dividend on an annual basis, subject to declaration by the Board of Directors. The preparation of financial statements and related disclosures in conformity with U.S. generally accepted accounting principles (“GAAP”) and the Company’s discussion and analysis of its financial condition and operating results require the Company’s management to make judgments, assumptions and estimates that affect the amounts reported. Note 1, “Summary of Significant Accounting Policies” of the Notes to Consolidated Financial Statements in Part II, Item 8 of this Form 10-K describes the significant accounting policies and methods used in the preparation of the Company’s consolidated financial statements. Management bases its estimates on historical experience and on various other assumptions it believes to be reasonable under the circumstances, the results of which form the basis for making judgments about the carrying values of assets and liabilities. The Company is subject to income taxes in the U.S. 
and numerous foreign jurisdictions. The evaluation of the Company’s uncertain tax positions involves significant judgment in the interpretation and application of GAAP and complex domestic and international tax laws, including the Act and matters related to the allocation of international taxation rights between countries. Although management believes the Company’s reserves are reasonable, no assurance can be given that the final outcome of these uncertainties will not be different from that which is reflected in the Company’s reserves. Reserves are adjusted considering changing facts and circumstances, such as the closing of a tax examination. Resolution of these uncertainties in a manner inconsistent with management’s expectations could have a material impact on the Company’s financial condition and operating results. The Company is subject to various legal proceedings and claims that arise in the ordinary course of business, the outcomes of which are inherently uncertain. The Company records a liability when it is probable that a loss has been incurred and the amount is reasonably estimable, the determination of which requires significant judgment. 
Resolution of legal matters in a manner inconsistent with management’s expectations could have a material impact on the Company’s financial condition and operating results.',\n", + " 'COMPENSATION': 'The information required by this Item will be included in the 2024 Proxy Statement, and is incorporated herein by reference.',\n", + " 'PRINCIPAL_STOCKHOLDERS': 'The information required by this Item will be included in the 2024 Proxy Statement, and is incorporated herein by reference.',\n", + " 'RELATED_PARTY_TRANSACTIONS': 'The information required by this Item will be included in the 2024 Proxy Statement, and is incorporated herein by reference.',\n", + " 'ACCOUNTING_FEES': '',\n", + " 'EXHIBITS': '',\n", + " 'FORM_SUMMARY': ''},\n", + " 'filing_type': '10-K'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finnlp-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_Social_Media.ipynb b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_Social_Media.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a346dd8ead1c9a11b44912cac227b1dabf41bd87 --- /dev/null +++ b/FinNLP/docs/FinNLP/docs/jupyter/Data_Sources_Social_Media.ipynb @@ -0,0 +1,2261 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../FinNLP\")" + ] + }, + { + "attachments": {}, + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### Eastmoney" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.eastmoney_streaming import Eastmoney_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 3\n", + "stock = \"600519\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "downloader = Eastmoney_Streaming()\n", + "downloader.download_streaming_stock(stock, pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(241, 92)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
post_idpost_titlestockbar_codestockbar_namestockbar_typeuser_iduser_nicknameuser_extendinfospost_click_countpost_forward_count...relate_topiczwpage_flagsource_post_comment_countpost_atuserreply_listcontent_typerepost_statereptile_stateallow_likes_statepost_is_hot
01324058647贵州茅台:每股派25.911元 6月30日共计派发现金红利325.49亿元600519贵州茅台吧100.07344113638256342贵州茅台资讯{'user_accreditinfos': None, 'deactive': '0', ...379914...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

1 rows × 92 columns

\n", + "
" + ], + "text/plain": [ + " post_id post_title stockbar_code \\\n", + "0 1324058647 贵州茅台:每股派25.911元 6月30日共计派发现金红利325.49亿元 600519 \n", + "\n", + " stockbar_name stockbar_type user_id user_nickname \\\n", + "0 贵州茅台吧 100.0 7344113638256342 贵州茅台资讯 \n", + "\n", + " user_extendinfos post_click_count \\\n", + "0 {'user_accreditinfos': None, 'deactive': '0', ... 3799 \n", + "\n", + " post_forward_count ... relate_topic zwpage_flag \\\n", + "0 14 ... NaN NaN \n", + "\n", + " source_post_comment_count post_atuser reply_list content_type \\\n", + "0 NaN NaN NaN NaN \n", + "\n", + " repost_state reptile_state allow_likes_state post_is_hot \n", + "0 NaN NaN NaN NaN \n", + "\n", + "[1 rows x 92 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.dataframe.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
post_titleuser_nicknamestockbar_namepost_click_countpost_forward_countpost_comment_countpost_publish_timepost_last_timepost_display_time
0贵州茅台:每股派25.911元 6月30日共计派发现金红利325.49亿元贵州茅台资讯贵州茅台吧379914152023-06-25 22:17:502023-06-26 03:12:472023-06-25 22:17:50
1贵州茅台:贵州茅台2022年年度权益分派实施公告贵州茅台资讯贵州茅台吧642347172023-06-25 15:32:422023-06-26 00:57:392023-06-26 00:00:00
2将派发现金红利325.49亿元!贵州茅台上市以来累计分红超2000亿元贵州茅台资讯贵州茅台吧460102023-06-25 23:49:072023-06-25 23:49:072023-06-25 23:49:07
3茅台冰淇淋悄然卖数亿 年轻市场真被抓住了吗贵州茅台资讯贵州茅台吧261215112023-06-24 07:03:532023-06-25 18:48:212023-06-24 07:03:53
4白酒本周跌5.49%原因是什么?下周怎么看?NaNNaN101974252023-06-24 12:29:532023-06-25 23:12:492023-06-24 12:29:53
5本周持仓与下周交易计划满仓日记财富号评论吧547212023-06-25 20:30:542023-06-26 03:19:082023-06-25 20:30:54
6茅台酒的估值真的是高菩萨小跟班888贵州茅台吧33002023-06-26 03:02:142023-06-26 03:02:142023-06-26 03:02:14
7茅台里面的资金估计要出来支持一些中小微企业政策导向[吃瓜]菩萨小跟班888贵州茅台吧24002023-06-26 01:50:122023-06-26 01:50:122023-06-26 01:50:12
8每股市值收益率,还没有银行定期利息高呢。(远离泡沫浮云地震带)章鱼帝的智慧贵州茅台吧33012023-06-25 22:48:492023-06-26 01:20:042023-06-25 22:48:49
96月最后的倔强(浪潮信息,昆仑万维,鸿博股份)赛道复苏。夏夏爱美丽财富号评论吧24590342023-06-25 22:16:032023-06-26 00:45:532023-06-25 22:16:03
\n", + "
" + ], + "text/plain": [ + " post_title user_nickname stockbar_name \\\n", + "0 贵州茅台:每股派25.911元 6月30日共计派发现金红利325.49亿元 贵州茅台资讯 贵州茅台吧 \n", + "1 贵州茅台:贵州茅台2022年年度权益分派实施公告 贵州茅台资讯 贵州茅台吧 \n", + "2 将派发现金红利325.49亿元!贵州茅台上市以来累计分红超2000亿元 贵州茅台资讯 贵州茅台吧 \n", + "3 茅台冰淇淋悄然卖数亿 年轻市场真被抓住了吗 贵州茅台资讯 贵州茅台吧 \n", + "4 白酒本周跌5.49%原因是什么?下周怎么看? NaN NaN \n", + "5 本周持仓与下周交易计划 满仓日记 财富号评论吧 \n", + "6 茅台酒的估值真的是高 菩萨小跟班888 贵州茅台吧 \n", + "7 茅台里面的资金估计要出来支持一些中小微企业政策导向[吃瓜] 菩萨小跟班888 贵州茅台吧 \n", + "8 每股市值收益率,还没有银行定期利息高呢。(远离泡沫浮云地震带) 章鱼帝的智慧 贵州茅台吧 \n", + "9 6月最后的倔强(浪潮信息,昆仑万维,鸿博股份)赛道复苏。 夏夏爱美丽 财富号评论吧 \n", + "\n", + " post_click_count post_forward_count post_comment_count \\\n", + "0 3799 14 15 \n", + "1 6423 47 17 \n", + "2 460 1 0 \n", + "3 2612 15 11 \n", + "4 10197 4 25 \n", + "5 547 2 1 \n", + "6 33 0 0 \n", + "7 24 0 0 \n", + "8 33 0 1 \n", + "9 2459 0 34 \n", + "\n", + " post_publish_time post_last_time post_display_time \n", + "0 2023-06-25 22:17:50 2023-06-26 03:12:47 2023-06-25 22:17:50 \n", + "1 2023-06-25 15:32:42 2023-06-26 00:57:39 2023-06-26 00:00:00 \n", + "2 2023-06-25 23:49:07 2023-06-25 23:49:07 2023-06-25 23:49:07 \n", + "3 2023-06-24 07:03:53 2023-06-25 18:48:21 2023-06-24 07:03:53 \n", + "4 2023-06-24 12:29:53 2023-06-25 23:12:49 2023-06-24 12:29:53 \n", + "5 2023-06-25 20:30:54 2023-06-26 03:19:08 2023-06-25 20:30:54 \n", + "6 2023-06-26 03:02:14 2023-06-26 03:02:14 2023-06-26 03:02:14 \n", + "7 2023-06-26 01:50:12 2023-06-26 01:50:12 2023-06-26 01:50:12 \n", + "8 2023-06-25 22:48:49 2023-06-26 01:20:04 2023-06-25 22:48:49 \n", + "9 2023-06-25 22:16:03 2023-06-26 00:45:53 2023-06-25 22:16:03 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"post_title\",\"user_nickname\", \"stockbar_name\" ,\"post_click_count\", \"post_forward_count\", \"post_comment_count\", \"post_publish_time\", \"post_last_time\", \"post_display_time\"]\n", + "downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": 
{}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Facebook get cookies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from selenium import webdriver\n", + "import json\n", + "\n", + "browser = webdriver.ChromiumEdge()\n", + "browser.get('https://www.facebook.com')\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Please log in to your account in the browser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cookies = browser.get_cookies() \n", + "with open(\"cookies.json\", \"w\", encoding=\"utf-8\") as cks:\n", + " json.dump(cookies, cks)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Facebook" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.facebook_streaming import Facebook_Streaming\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# load cookies\n", + "with open(\"cookies.json\", \"r\", encoding=\"utf-8\") as cks: \n", + " cookies = json.load(cks)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "config = {\n", + " \"cookies\":cookies, \n", + " \"headless\": False,\n", + " \"stealth_path\":\"../../FinNLP/finnlp/data_sources/social_media/stealth.min.js\"\n", + " }\n", + "pages = 3\n", + "stock = \"AAPL\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 17/17 [00:57<00:00, 3.37s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support the first page now!\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "downloader = Facebook_Streaming(config)\n", + "downloader.download_streaming_stock(stock, pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contentdate
6AAPL (Stock Market)4h󰞋󰙷
8Day 7\\nIntroduction to Stock Market\\nWhat you ...6h󰞋󰙷
11US: AAPL new high and breakout from two-year r...1d󰞋󰙷
\n", + "
" + ], + "text/plain": [ + " content date\n", + "6 AAPL (Stock Market) 4h󰞋󰙷\n", + "8 Day 7\\nIntroduction to Stock Market\\nWhat you ... 6h󰞋󰙷\n", + "11 US: AAPL new high and breakout from two-year r... 1d󰞋󰙷" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.dataframe" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Xueqiu / 雪球" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.xueqiu_streaming import Xueqiu_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 3\n", + "stock = \"茅台\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "downloader = Xueqiu_Streaming()\n", + "downloader.download_streaming_stock(stock, pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(29, 53)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
blockedblockingcanEditcommentIdcontroversialcreated_atdescriptiondonate_countdonate_snowcoineditable...truncated_bytypeuseruser_idview_countfirstImgpic_sizesedited_atquote_cardssymbol_id
0FalseFalseTrue0False2023-06-25 12:15:07<a href=\"http://xueqiu.com/S/SZ000860\" target=...00True...02{'allow_all_stock': False, 'block_status': 0, ...8364804052471NaNNaNNaNNaNNaN
\n", + "

1 rows × 53 columns

\n", + "
" + ], + "text/plain": [ + " blocked blocking canEdit commentId controversial created_at \\\n", + "0 False False True 0 False 2023-06-25 12:15:07 \n", + "\n", + " description donate_count \\\n", + "0 \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
created_atdescriptiontitletexttargetsourceuser
02023-06-25 12:15:07<a href=\"http://xueqiu.com/S/SZ000860\" target=...<a href=\"http://xueqiu.com/S/SZ000860\" target=.../8364804052/253976413Android{'allow_all_stock': False, 'block_status': 0, ...
12023-06-25 12:14:22<a href=\"http://xueqiu.com/S/SH600519\" target=...<p><a href=\"http://xueqiu.com/S/SH600519\" targ.../4631817224/253976390雪球{'allow_all_stock': False, 'block_status': 0, ...
22023-06-25 12:13:01...提高。白酒:五粮液、迎驾贡酒、<span class='highlight'>茅台</...6.25 赛道和白马的机会<p>这个假期外围的环境不太好,已经是基本共识了。明天开盘大A承压低开也基本是一致预期。这么.../4322952939/253976335雪球{'allow_all_stock': False, 'block_status': 0, ...
32023-06-25 11:58:55茅台发生活费了茅台发生活费了<br/><img class=\"ke_img\" src=\"https://x.../4653939718/253975764iPhone{'allow_all_stock': False, 'block_status': 0, ...
42023-06-25 11:54:05...业绩及股价,形成正反馈。当年<span class='highlight'>茅台</s...持仓吹票,共同致富<p><a href=\"http://xueqiu.com/k?q=%23%E4%BB%A5.../8113901491/253975613Android{'allow_all_stock': False, 'block_status': 0, ...
52023-06-25 11:50:11微酒酒业快讯,6月25日,酒业新闻一览-·企业动态·-01<span class='high...6.25:<span class='highlight'>茅</span><span cla...<p><img class=\"ke_img\" src=\"https://xqimg.imed.../3615583399/253975485雪球{'allow_all_stock': False, 'block_status': 0, ...
62023-06-25 11:48:42<a href=\"http://xueqiu.com/S/SH603027\" target=...<a href=\"http://xueqiu.com/S/SH603027\" target=.../2659542807/253975430iPhone{'allow_all_stock': False, 'block_status': 0, ...
72023-06-25 11:45:54段永平说:我不鼓励小散投<a href=\"https://xueqiu.com/S/AAPL...段永平说:我不鼓励小散投<a href=\"https://xueqiu.com/S/AAPL.../9456980430/253975338iPhone{'allow_all_stock': False, 'block_status': 0, ...
82023-06-25 11:33:01泸州老窖酒传统酿制技艺第二十三代传承人·国窖1573·曾娜大师鉴藏版,端午举杯小酒。<br/...泸州老窖酒传统酿制技艺第二十三代传承人·国窖1573·曾娜大师鉴藏版,端午举杯小酒。<br/.../9893982765/253974916Android{'allow_all_stock': False, 'block_status': 0, ...
92023-06-25 11:25:44...酒店中,白酒卖得最好的往往不是<span class='highlight'>茅台</...街头没生意的烟酒店,为什么不会倒闭<p><img class=\"ke_img\" src=\"https://xqimg.imed.../5497522856/253974630雪球{'allow_all_stock': False, 'block_status': 0, ...
\n", + "" + ], + "text/plain": [ + " created_at description \\\n", + "0 2023-06-25 12:15:07
茅台茅台茅台这个假期外围的环境不太好,已经是基本共识了。明天开盘大A承压低开也基本是一致预期。这么... /4322952939/253976335 \n", + "3 茅台发生活费了
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idbodycreated_atusersourcesymbolspricesmentioned_usersentitiesliked_by_selfreshared_by_selflinksreshare_messageconversationlikesresharesnetwork
0522005335NANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL...2023-04-07T15:24:22Z{'id': 4744627, 'username': 'JavierAyala', 'na...{'id': 1149, 'title': 'StockTwits for iOS', 'u...[{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '...[{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '...[]{'sentiment': None}FalseFalseNaNNaNNaNNaNNaNNaN
1522004768$AAPL $SPY \\n \\nhttps://amp.scmp.com/news/chi...2023-04-07T15:17:43Z{'id': 6330207, 'username': 'PlainFacts_2121',...{'id': 2269, 'title': 'StockTwits Web', 'url':...[{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '...[{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '...[]{'sentiment': None}FalseFalse[{'title': 'China officials who abused health ...NaNNaNNaNNaNNaN
\n", + "" + ], + "text/plain": [ + " id body \\\n", + "0 522005335 NANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL... \n", + "1 522004768 $AAPL $SPY \\n \\nhttps://amp.scmp.com/news/chi... \n", + "\n", + " created_at user \\\n", + "0 2023-04-07T15:24:22Z {'id': 4744627, 'username': 'JavierAyala', 'na... \n", + "1 2023-04-07T15:17:43Z {'id': 6330207, 'username': 'PlainFacts_2121',... \n", + "\n", + " source \\\n", + "0 {'id': 1149, 'title': 'StockTwits for iOS', 'u... \n", + "1 {'id': 2269, 'title': 'StockTwits Web', 'url':... \n", + "\n", + " symbols \\\n", + "0 [{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '... \n", + "1 [{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '... \n", + "\n", + " prices mentioned_users \\\n", + "0 [{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '... [] \n", + "1 [{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '... [] \n", + "\n", + " entities liked_by_self reshared_by_self \\\n", + "0 {'sentiment': None} False False \n", + "1 {'sentiment': None} False False \n", + "\n", + " links reshare_message \\\n", + "0 NaN NaN \n", + "1 [{'title': 'China officials who abused health ... NaN \n", + "\n", + " conversation likes reshares network \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
created_atbody
02023-04-07T15:24:22ZNANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL...
12023-04-07T15:17:43Z$AAPL $SPY \\n \\nhttps://amp.scmp.com/news/chi...
22023-04-07T15:17:25Z$AAPL $GOOG $AMZN I took a Trump today. \\n\\nH...
32023-04-07T15:16:54Z$SPY $AAPL will take this baby down, time for ...
42023-04-07T15:11:37Z$SPY $3T it ALREADY DID - look at the pre-COV...
52023-04-07T15:10:29Z$AAPL $QQQ $STUDY We are on to the next one! A...
62023-04-07T15:06:00Z$AAPL was analyzed by 48 analysts. The buy con...
72023-04-07T14:54:29Z$AAPL both retiring. \\n \\nCraig....
82023-04-07T14:40:06Z$SPY $QQQ $TSLA $AAPL SPY 500 HAS STARTED🚀😍 BI...
92023-04-07T14:38:57ZNancy 🩵 (Tim) $AAPL
\n", + "
" + ], + "text/plain": [ + " created_at body\n", + "0 2023-04-07T15:24:22Z NANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL...\n", + "1 2023-04-07T15:17:43Z $AAPL $SPY \\n \\nhttps://amp.scmp.com/news/chi...\n", + "2 2023-04-07T15:17:25Z $AAPL $GOOG $AMZN I took a Trump today. \\n\\nH...\n", + "3 2023-04-07T15:16:54Z $SPY $AAPL will take this baby down, time for ...\n", + "4 2023-04-07T15:11:37Z $SPY $3T it ALREADY DID - look at the pre-COV...\n", + "5 2023-04-07T15:10:29Z $AAPL $QQQ $STUDY We are on to the next one! A...\n", + "6 2023-04-07T15:06:00Z $AAPL was analyzed by 48 analysts. The buy con...\n", + "7 2023-04-07T14:54:29Z $AAPL both retiring. \\n \\nCraig....\n", + "8 2023-04-07T14:40:06Z $SPY $QQQ $TSLA $AAPL SPY 500 HAS STARTED🚀😍 BI...\n", + "9 2023-04-07T14:38:57Z Nancy 🩵 (Tim) $AAPL" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"created_at\", \"body\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reddit Wallstreetbets Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.reddit_streaming import Reddit_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 3\n", + "config = {\n", + " # \"use_proxy\": \"us_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 2,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading by pages...: 100%|██████████| 3/3 [00:08<00:00, 2.83s/it]\n" + ] + } + ], + "source": [ + "downloader = Reddit_Streaming(config)\n", + "downloader.download_streaming_all(pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnumCommentscreatedscoredistinguishTypeisLockedisStickiedthumbnailtitleauthor...postEventInfopredictionTournamentreactedFromremovedByremovedByCategorysubredditsuggestedCommentSorttopAwardedTypeurlwhitelistStatus
0t3_12epaq0816808819740000NoneFalseFalse{'url': 'https://b.thumbs.redditmedia.com/W8hd...Y’all making me feel like spoodermanghostwholags...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1t3_zr9v10016715957820002NoneTrueFalse{'url': 'https://b.thumbs.redditmedia.com/dJqb...Do you track your investments in a spreadsheet...sharesight...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

2 rows × 100 columns

\n", + "
" + ], + "text/plain": [ + " id numComments created score distinguishType isLocked \\\n", + "0 t3_12epaq0 8 1680881974000 0 None False \n", + "1 t3_zr9v10 0 1671595782000 2 None True \n", + "\n", + " isStickied thumbnail \\\n", + "0 False {'url': 'https://b.thumbs.redditmedia.com/W8hd... \n", + "1 False {'url': 'https://b.thumbs.redditmedia.com/dJqb... \n", + "\n", + " title author ... \\\n", + "0 Y’all making me feel like spooderman ghostwholags ... \n", + "1 Do you track your investments in a spreadsheet... sharesight ... \n", + "\n", + " postEventInfo predictionTournament reactedFrom removedBy removedByCategory \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "\n", + " subreddit suggestedCommentSort topAwardedType url whitelistStatus \n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "\n", + "[2 rows x 100 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnumCommentscreatedscoredistinguishTypeisLockedisStickiedthumbnailtitleauthor...postEventInfopredictionTournamentreactedFromremovedByremovedByCategorysubredditsuggestedCommentSorttopAwardedTypeurlwhitelistStatus
0t3_12epaq082023-04-07 15:39:340NoneFalseFalse{'url': 'https://b.thumbs.redditmedia.com/W8hd...Y’all making me feel like spoodermanghostwholags...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1t3_zr9v1002022-12-21 04:09:422NoneTrueFalse{'url': 'https://b.thumbs.redditmedia.com/dJqb...Do you track your investments in a spreadsheet...sharesight...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

2 rows × 100 columns

\n", + "
" + ], + "text/plain": [ + " id numComments created score distinguishType isLocked \\\n", + "0 t3_12epaq0 8 2023-04-07 15:39:34 0 None False \n", + "1 t3_zr9v10 0 2022-12-21 04:09:42 2 None True \n", + "\n", + " isStickied thumbnail \\\n", + "0 False {'url': 'https://b.thumbs.redditmedia.com/W8hd... \n", + "1 False {'url': 'https://b.thumbs.redditmedia.com/dJqb... \n", + "\n", + " title author ... \\\n", + "0 Y’all making me feel like spooderman ghostwholags ... \n", + "1 Do you track your investments in a spreadsheet... sharesight ... \n", + "\n", + " postEventInfo predictionTournament reactedFrom removedBy removedByCategory \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "\n", + " subreddit suggestedCommentSort topAwardedType url whitelistStatus \n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "\n", + "[2 rows x 100 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df[\"created\"] = pd.to_datetime(df[\"created\"], unit = \"ms\")\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
createdtitle
02023-04-07 15:39:34Y’all making me feel like spooderman
12022-12-21 04:09:42Do you track your investments in a spreadsheet...
22022-12-21 04:09:42Do you track your investments in a spreadsheet...
32023-04-07 15:29:23Can a Blackberry holder get some help 🥺
42023-04-07 14:49:55The week of CPI and FOMC Minutes… 4-6-23 SPY/ ...
52023-04-07 14:19:22Well let’s hope your job likes you, thanks Jerome
62023-04-07 14:06:32Does anyone else feel an overwhelming sense of...
72023-04-07 13:47:59Watermarked Jesus explains the market being cl...
82023-04-07 13:26:23Jobs report shows 236,000 gain in March. Hot l...
92023-04-07 13:07:15The recession is over! Let's buy more stocks!
\n", + "
" + ], + "text/plain": [ + " created title\n", + "0 2023-04-07 15:39:34 Y’all making me feel like spooderman\n", + "1 2022-12-21 04:09:42 Do you track your investments in a spreadsheet...\n", + "2 2022-12-21 04:09:42 Do you track your investments in a spreadsheet...\n", + "3 2023-04-07 15:29:23 Can a Blackberry holder get some help 🥺\n", + "4 2023-04-07 14:49:55 The week of CPI and FOMC Minutes… 4-6-23 SPY/ ...\n", + "5 2023-04-07 14:19:22 Well let’s hope your job likes you, thanks Jerome\n", + "6 2023-04-07 14:06:32 Does anyone else feel an overwhelming sense of...\n", + "7 2023-04-07 13:47:59 Watermarked Jesus explains the market being cl...\n", + "8 2023-04-07 13:26:23 Jobs report shows 236,000 gain in March. Hot l...\n", + "9 2023-04-07 13:07:15 The recession is over! Let's buy more stocks!" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"created\", \"title\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Weibo Date Range" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.weibo_date_range import Weibo_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2016-01-01\"\n", + "end_date = \"2016-01-02\"\n", + "stock = \"茅台\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + " \"cookies\": \"Your_Login_Cookies\",\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 5/5 [00:09<00:00, 1.95s/it]\n", + "Checking ips: 100%|██████████| 75/75 [01:23<00:00, 1.11s/it]\n" + ] + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "获取到的代理ip数量: 75 。Get proxy ips: 75.\n", + "能用的代理数量: 13。Usable proxy ips: 13.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading by dates...: 100%|██████████| 2/2 [01:03<00:00, 31.56s/it]\n" + ] + } + ], + "source": [ + "downloader = Weibo_Date_Range(config)\n", + "downloader.download_date_range_stock(start_date, end_date, stock = stock)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datedate_contentsourcecontent
02016-01-012016年01月01日23:41Moto X#舆论之锤#唯品会发声明证实销售假茅台-手机腾讯网O网页链接分享来自浏览器!
22016-01-012016年01月01日22:57新浪博客2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原...
62016-01-012016年01月01日22:56新浪博客2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原...
172016-01-012016年01月01日22:40五蕴皆崆Android开心,今天喝了两斤酒(茅台+扎二)三个人,开心!
182016-01-01NaNNaN一家专卖假货的网站某宝,你该学学了!//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品...
192016-01-01NaNNaN一家专卖假货的网站//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品会售假茅台:供货商...
202016-01-012016年01月01日21:46360安全浏览器前几天说了几点不看好茅台的理由,今年过节喝点茅台支持下,个人口感,茅台比小五好喝,茅台依然是...
212016-01-012016年01月01日21:44华为P8老杜酱酒已到货,从明天起正式在甘肃武威开卖。可以不相信我说的话,但一定不要怀疑@杜子建的为人...
222016-01-012016年01月01日21:24华为Ascend P7【唯品会售假茅台后续:供货商被刑拘顾客获十倍补偿】此前,有网友投诉其在唯品会购买的茅台酒质量...
232016-01-012016年01月01日21:16实得惠省钱网唯品会卖假茅台,供货商被刑拘,买家获十倍补偿8888元|此前,有网友在网络论坛发贴(唯品会宣...
\n", + "
" + ], + "text/plain": [ + " date date_content source \\\n", + "0 2016-01-01 2016年01月01日23:41 Moto X \n", + "2 2016-01-01 2016年01月01日22:57 新浪博客 \n", + "6 2016-01-01 2016年01月01日22:56 新浪博客 \n", + "17 2016-01-01 2016年01月01日22:40 五蕴皆崆Android \n", + "18 2016-01-01 NaN NaN \n", + "19 2016-01-01 NaN NaN \n", + "20 2016-01-01 2016年01月01日21:46 360安全浏览器 \n", + "21 2016-01-01 2016年01月01日21:44 华为P8 \n", + "22 2016-01-01 2016年01月01日21:24 华为Ascend P7 \n", + "23 2016-01-01 2016年01月01日21:16 实得惠省钱网 \n", + "\n", + " content \n", + "0 #舆论之锤#唯品会发声明证实销售假茅台-手机腾讯网O网页链接分享来自浏览器! \n", + "2 2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原... \n", + "6 2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原... \n", + "17 开心,今天喝了两斤酒(茅台+扎二)三个人,开心! \n", + "18 一家专卖假货的网站某宝,你该学学了!//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品... \n", + "19 一家专卖假货的网站//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品会售假茅台:供货商... \n", + "20 前几天说了几点不看好茅台的理由,今年过节喝点茅台支持下,个人口感,茅台比小五好喝,茅台依然是... \n", + "21 老杜酱酒已到货,从明天起正式在甘肃武威开卖。可以不相信我说的话,但一定不要怀疑@杜子建的为人... \n", + "22 【唯品会售假茅台后续:供货商被刑拘顾客获十倍补偿】此前,有网友投诉其在唯品会购买的茅台酒质量... \n", + "23 唯品会卖假茅台,供货商被刑拘,买家获十倍补偿8888元|此前,有网友在网络论坛发贴(唯品会宣... 
" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df = df.drop_duplicates()\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60, 4)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Weibo Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.weibo_streaming import Weibo_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "rounds = 3\n", + "stock = \"茅台\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + " \"cookies\": \"Your_Login_Cookies\",\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 5/5 [00:09<00:00, 1.98s/it]\n", + "Checking ips: 100%|██████████| 75/75 [01:26<00:00, 1.15s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "获取到的代理ip数量: 75 。Get proxy ips: 75.\n", + "能用的代理数量: 19。Usable proxy ips: 19.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing the text content and downloading the full passage...: 100%|██████████| 9/9 [00:00<00:00, 64.89it/s]\n", + "Processing the text content and downloading the full passage...: 100%|██████████| 10/10 [00:09<00:00, 1.07it/s]\n", + "Processing the text content and downloading the full passage...: 100%|██████████| 10/10 [00:02<00:00, 4.93it/s]\n", + "Downloading by page..: 
100%|██████████| 3/3 [00:19<00:00, 6.46s/it]\n" + ] + } + ], + "source": [ + "downloader = Weibo_Streaming(config)\n", + "downloader.download_streaming_stock(stock = stock, rounds = rounds)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
card_typedisplay_followbtnmblogitemidactionlogcate_iddisplay_arrowshow_typeschemecontainer_colorcontainer_color_darkcontent_shortcontent
09False{'attitudes_count': 0, 'can_edit': False, 'com...seqid:187118896|type:61|t:|pos:1-0-0|q:茅台|srid...{'act_code': 554, 'ext': 'seqid:187118896|type...3101https://m.weibo.cn/status/MAWMprpPp?mblogid=MA...#EEEEEE#151515事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市...事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市...
19False{'attitudes_count': 0, 'can_edit': False, 'com...seqid:187118896|type:61|t:|pos:1-0-1|q:茅台|srid...{'act_code': 554, 'ext': 'seqid:187118896|type...3101https://m.weibo.cn/status/MAWHVDm0H?mblogid=MA...#EEEEEE#151515茅台茅台成都收4瓶飞天,自提茅台茅台成都收4瓶飞天,自提
\n", + "
" + ], + "text/plain": [ + " card_type display_followbtn \\\n", + "0 9 False \n", + "1 9 False \n", + "\n", + " mblog \\\n", + "0 {'attitudes_count': 0, 'can_edit': False, 'com... \n", + "1 {'attitudes_count': 0, 'can_edit': False, 'com... \n", + "\n", + " itemid \\\n", + "0 seqid:187118896|type:61|t:|pos:1-0-0|q:茅台|srid... \n", + "1 seqid:187118896|type:61|t:|pos:1-0-1|q:茅台|srid... \n", + "\n", + " actionlog cate_id display_arrow \\\n", + "0 {'act_code': 554, 'ext': 'seqid:187118896|type... 31 0 \n", + "1 {'act_code': 554, 'ext': 'seqid:187118896|type... 31 0 \n", + "\n", + " show_type scheme \\\n", + "0 1 https://m.weibo.cn/status/MAWMprpPp?mblogid=MA... \n", + "1 1 https://m.weibo.cn/status/MAWHVDm0H?mblogid=MA... \n", + "\n", + " container_color container_color_dark \\\n", + "0 #EEEEEE #151515 \n", + "1 #EEEEEE #151515 \n", + "\n", + " content_short \\\n", + "0 事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市... \n", + "1 茅台茅台成都收4瓶飞天,自提 \n", + "\n", + " content \n", + "0 事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市... \n", + "1 茅台茅台成都收4瓶飞天,自提 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
content_shortcontent
0事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市...事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市...
1茅台茅台成都收4瓶飞天,自提茅台茅台成都收4瓶飞天,自提
2我可太喜欢茅台这个防伪了我可太喜欢茅台这个防伪了
3没想到 4S店的二楼 是卖茅台的吧没想到 4S店的二楼 是卖茅台的吧
4买不起茅台,砸锅卖铁也得买得起茅台冰淇淋 许昌·胖东来时代广场买不起茅台,砸锅卖铁也得买得起茅台冰淇淋 许昌·胖东来时代广场
5xxx给我枇杷xxx给我蜂蜜 xxx偷茅台喝(假的)。我很喜欢自己家的产品,感觉很无害纯天然...xxx给我枇杷xxx给我蜂蜜 xxx偷茅台喝(假的)。我很喜欢自己家的产品,感觉很无害纯天然...
6茅台 奎屯出一只兔茅茅台 奎屯出一只兔茅
72022胡润酒类品牌榜发布 2022胡润酒类品牌榜发布点评:与我印象中的有点出入。不出茅台和...2022胡润酒类品牌榜发布 2022胡润酒类品牌榜发布点评:与我印象中的有点出入。不出茅台和...
841岁,很美妙!“爸爸生日快乐,吃个蛋糕🍰”小奶音听着上头。爱人,亲戚,朋友,草莓🍓,茅台+...41岁,很美妙!“爸爸生日快乐,吃个蛋糕🍰”小奶音听着上头。爱人,亲戚,朋友,草莓🍓,茅台+...
0吃到了茅台冰激淋也吃到了茅台冰激淋也
\n", + "
" + ], + "text/plain": [ + " content_short \\\n", + "0 事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市... \n", + "1 茅台茅台成都收4瓶飞天,自提 \n", + "2 我可太喜欢茅台这个防伪了 \n", + "3 没想到 4S店的二楼 是卖茅台的吧 \n", + "4 买不起茅台,砸锅卖铁也得买得起茅台冰淇淋 许昌·胖东来时代广场 \n", + "5 xxx给我枇杷xxx给我蜂蜜 xxx偷茅台喝(假的)。我很喜欢自己家的产品,感觉很无害纯天然... \n", + "6 茅台 奎屯出一只兔茅 \n", + "7 2022胡润酒类品牌榜发布 2022胡润酒类品牌榜发布点评:与我印象中的有点出入。不出茅台和... \n", + "8 41岁,很美妙!“爸爸生日快乐,吃个蛋糕🍰”小奶音听着上头。爱人,亲戚,朋友,草莓🍓,茅台+... \n", + "0 吃到了茅台冰激淋也 \n", + "\n", + " content \n", + "0 事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市... \n", + "1 茅台茅台成都收4瓶飞天,自提 \n", + "2 我可太喜欢茅台这个防伪了 \n", + "3 没想到 4S店的二楼 是卖茅台的吧 \n", + "4 买不起茅台,砸锅卖铁也得买得起茅台冰淇淋 许昌·胖东来时代广场 \n", + "5 xxx给我枇杷xxx给我蜂蜜 xxx偷茅台喝(假的)。我很喜欢自己家的产品,感觉很无害纯天然... \n", + "6 茅台 奎屯出一只兔茅 \n", + "7 2022胡润酒类品牌榜发布 2022胡润酒类品牌榜发布点评:与我印象中的有点出入。不出茅台和... \n", + "8 41岁,很美妙!“爸爸生日快乐,吃个蛋糕🍰”小奶音听着上头。爱人,亲戚,朋友,草莓🍓,茅台+... \n", + "0 吃到了茅台冰激淋也 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"content_short\", \"content\"]\n", + "df[selected_columns].head(10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finrl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/docs/FinNLP/docs/zh/index.md b/FinNLP/docs/FinNLP/docs/zh/index.md new file mode 100644 index 0000000000000000000000000000000000000000..adb7f7399b7f4087af018eecf40132e7d37e9259 --- /dev/null +++ b/FinNLP/docs/FinNLP/docs/zh/index.md @@ -0,0 +1,127 @@ +# 互联网金融数据 + +演示内容请参见[FinGPT](https://github.com/AI4Finance-Foundation/ChatGPT-for-FinTech) + +**免责声明:我们根据MIT教育许可证的规定共享代码以供学术研究之用。此处不构成任何金融建议,亦非交易真实资金的推荐。在交易或投资之前请使用常识并首先咨询专业人士。** + +## Ⅰ. 
架构 + +![image-20230505200244043](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052002139.png) + +* 整个项目由4个部分组成: + + * 第一部分是**数据源**,在这里,我们从互联网上收集历史和流媒体数据。 + + * 接下来,我们将数据推送到**数据工程**部分,在这里我们会对数据进行清洗,标记化处理和提示工程。 + + * 然后,数据被推送到**大语言模型(LLMs)**。在这里,我们可以以不同的方式使用LLMs。我们不仅可以使用收集到的数据来训练我们自己的**轻量级微调模型**,还可以使用这些数据和**训练好的模型**或**LLM API**来支持我们的应用程序。 + + * 最后一部分将是**应用程序**部分,我们可以使用数据和LLMs来制作许多有趣的应用程序。 + +## Ⅱ. 数据源 + +![image-20230505200446477](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052004539.png) + +* 由于空间限制,我们只展示了其中一部分。 + +### 1. [新闻](jupyter/Data_Sources_News.ipynb) + +| 平台 | 数据类型 | 相关市场 | 指定公司 | 时间范围 | 数据源类型 | 限制条件 | 文档数量(万) | 支持情况 | +| :----------------------------------------------------------: | :--------: | :------------: | :----------------------------------------------------------: | :---------------: | :--------: | :-------------------: | ------------------------------------------------------------ | ------------------------------------------------------------ | +| 雅虎 | 金融新闻 | 美国股票 | √ | 时间范围 | 官方 | N/A | 1,500+ | √ | +| 路透社 | 金融新闻 | 美国股票 | × | 时间范围 | 官方 | N/A | 1,500+ | √ | +| 新浪 | 金融新闻 | 中国股票 | × | 时间范围 | 官方 | N/A | 2,000+ | √ | +| 东方财富 | 金融新闻 | 中国股票 | √ | 时间范围 | 官方 | N/A | 1,000+ | √ | +| 第一财经 | 金融新闻 | 中国股票 | √ | 时间范围 | 官方 | N/A | 500+ | 即将 | +| 央视 | 政府新闻 | 中国股票 | × | 时间范围 | 第三方 | N/A | 4 | √ | +| 美国主流媒体 | 金融新闻 | 美国股票 | √ | 时间范围 | 第三方 | 账户 (免费) | 3,200+ | √ | +| 中国主流媒体 | 金融新闻 | 中国股票 | × | 时间范围 | 第三方 | ¥500/年 | 3000+ | √ | + +* FinGPT可能比Bloomberg的文档数目更少,但我们在同一个数量级上。 + +### 2. 
[社交媒体](jupyter/Data_Sources_Social_Media.ipynb) + +| 平台 | 数据类型 | 相关市场 | 指定公司 | 范围类型 | 来源类型 | 限制 | 文档 (1e4) | 支持 | +| :---------------------: | :------: | :------: | :------: | :------: | :------: | :-----: | ---------- | :--: | +| Twitter | 推文 | 美国股票 | √ | 时间范围 | 官方 | N/A | 18,000+ | √ | +| StockTwits | 推文 | 美国股票 | √ | 最新 | 官方 | N/A | 160,000+ | √ | +| Reddit (wallstreetbets) | 帖子 | 美国股票 | × | 最新 | 官方 | N/A | 9+ | √ | +| 微博 | 推文 | 中国股票 | √ | 时间范围 | 官方 | Cookies | 1,400,000+ | √ | +| 微博 | 推文 | 中国股票 | √ | 最新 | 官方 | N/A | 1,400,000+ | √ | + +* 在 **BloombergGPT** 中,他们**不收集社交媒体数据**,但我们认为**公众舆论是干扰股票市场的最重要因素之一**。 + +### 3. [公司公告](jupyter/Data_Sources_Company_Announcement.ipynb) + +| 平台 | 数据类型 | 相关市场 | 指定公司 | 范围类型 | 数据来源 | 限制 | 文档数 (1e4) | 支持情况 | +| :---------------: | :------: | :------: | :------: | :------: | :------: | :--: | ------------ | :------: | +| 巨潮网 (官方) | 文本 | 中国股票 | √ | 时间范围 | 官方 | N/A | 2,790+ | √ | +| 美国证监会 (官方) | 文本 | 美国股票 | √ | 时间范围 | 官方 | N/A | 1,440+ | √ | + +* 由于我们从不同的股票市场收集数据,因此我们比Bloomberg GPT有更多的申报文档。 + +### 4. 趋势 + +| 平台 | 数据类型 | 相关市场 | 数据源 | 指定公司 | 范围类型 | 源类型 | 限制 | +| :--------------------------------------------------: | :------: | :------: | :-----------------------------------------------------: | :------: | :------: | :----: | :--: | +| [谷歌趋势](https://trends.google.com/trends/explore) | 指数 | 美国股票 | [Google Trends](./finnlp/data_sources/trends/google.py) | √ | 日期范围 | 官方 | N/A | +| [百度指数](https://index.baidu.com/v2/index.html#/) | 指数 | 中国股票 | 即将推出 | - | - | - | - | + + +### 5. 数据集 +| 数据源 | 类型 | 股票 | 日期 | 可用性 | +| :----------------------------------------------------------: | :--: | :--: | :----------------------: | :----: | +| [AShare](https://github.com/JinanZou/Astock) | 新闻 | 3680 | 2018-07-01 到 2021-11-30 | √ | +| [stocknet-dataset](https://github.com/yumoxu/stocknet-dataset) | 推文 | 87 | 2014-01-02 到 2015-12-30 | √ | +| [CHRNN](https://github.com/wuhuizhe/CHRNN) | 推文 | 38 | 2017-01-03 到 2017-12-28 | √ | + +## Ⅲ. 
模型 + +![image-20230505200618504](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052006541.png) + +* 在以数据为中心的自然语言处理领域,我们不需要从头开始训练模型。我们只需要调用API和进行轻量级的微调。 +* 左边是一些可能会用到的LLM APIs,中间是我们可能用来进行微调的模型,右边是一些微调方法。 + +### 1. 微调:Tensor Layers (LoRA) + +![image-20230505200944411](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052009480.png) + +* 在FinGPT中,我们使用新的金融数据集对预训练的LLM进行微调。高质量的标记数据是许多成功的LLM(包括ChatGPT)的最重要的关键之一。 +* 然而,这些高质量的标记数据通常非常昂贵和耗时,并且我们可能需要金融专家的帮助。 +* 如果我们的目标是使用LLM分析与金融相关的文本数据并帮助量化交易,为什么不让市场为我们做标记呢? +* 因此,在这里,我们使用每个新闻相关的股票价格变化百分比作为输出标签,我们使用阈值将标签分成三组(积极的,消极的和中立的),并将它们用作新闻情感的标签。 +* 相应地,在提示工程部分,我们还要求模型选择其中一个正面的,负面的和中性的作为输出,以便我们充分利用预训练信息。 +* 通过使用LoRA,我们可以将可训练参数从6.17B减少到3.67M。 +* 如表格所示,与chatGLM相比,FinGPT可以在多个指标上实现大幅改善。然而,直接将我们的模型用于量化交易可能是不合适的。由于大多数新闻标题都是中性的,LLMs的大多数原始输出都是中性的,因此LLMs在积极和消极的标签上表现不佳,而这些标签可能对于量化交易是有用的。 +* 然而,在微调之后,我们已经见证了在预测积极和消极标签方面的巨大改进。 +* 这也是为什么该模型可以实现积极的交易结果的原因。 + +### 2. 微调:强化学习在股价上的应用 (RLSP) + +![image-20230505201209946](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052012996.png) + +* 同样地,我们可以使用股价上的强化学习(RLSP)来替换ChatGPT中使用的人类反馈上的强化学习。 + +## Ⅳ. 应用 + +### 1. 智能投顾 + +![image-20230505201913233](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052019296.png) + +* **ChatGPT可以像专业人士一样进行投资建议。** +* 在这个例子中,苹果的**股价上涨**与ChatGPT分析新闻的**预测相符**。 + +### 2. 量化交易 + +![image-20230505201841001](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052018035.png) + +* 我们还可以使用新闻、社交媒体推文或者公司公告来**构建情感因子**,右侧的部分是由Twitter推文和ChatGPT信号产生的交易结果,数据来自于一个称为[stocknet-dataset](https://link.zhihu.com/?target=https%3A//github.com/yumoxu/stocknet-dataset)的数据集。 +* 正如您从图片中所看到的,由ChatGPT生成的交易信号**非常出色**,我们甚至可以**仅通过根据Twitter情感因子交易而获得良好的结果**。 +* 因此,我们可以通过**结合价格因素**来获得更好的结果。 + +### 3. 
低代码开发 + +![image-20230505202028292](https://cdn.jsdelivr.net/gh/oliverwang15/imgbed@main/img/202305052020363.png) + +* 我们可以使用LLMs的帮助来编写代码。 +* 右侧显示了我们如何**快速高效地**开发我们的因子和其他代码。 \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/mkdocs.yml b/FinNLP/docs/FinNLP/mkdocs.yml new file mode 100644 index 0000000000000000000000000000000000000000..7649b580d8a4b6553b12504740d9cc8cb8366f02 --- /dev/null +++ b/FinNLP/docs/FinNLP/mkdocs.yml @@ -0,0 +1,44 @@ +site_name: FinGPT & FinNLP +site_author: Oliver Wang, Xiao-yang Liu + +nav: + - Hello World: + - About the project: 'index.md' + + - FinGPT Models: + - FinGPT-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v1' + - FinGPT-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v2' + - FinGPT-v3: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/FinGPT-v3' + + - Robo Advisor: + - chatgpt-robo-advisor-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-robo-advisor-v1' + - chatgpt-robo-advisor-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-robo-advisor-v2' + + - Quantitative Trading: + - chatgpt-trading-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-trading-v1' + - chatgpt-trading-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-trading-v2' + + - Low code development: + - chatgpt-low-code-development-v1: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-low-code-development-v1' + - chatgpt-low-code-development-v2: 'https://github.com/AI4Finance-Foundation/FinGPT/tree/master/fingpt/chatgpt-low-code-development-v2' + + - Data Sources: + - News: jupyter/Data_Sources_News.ipynb + - Social Media: jupyter/Data_Sources_Social_Media.ipynb + - Company Announcement: jupyter/Data_Sources_Company_Announcement.ipynb + +theme: + name: material + +plugins: + - mkdocs-jupyter: + execute: false + +extra: + alternate: + 
- name: English + link: / + lang: en + - name: 中文 + link: /zh/ + lang: zh \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/site/404.html b/FinNLP/docs/FinNLP/site/404.html new file mode 100644 index 0000000000000000000000000000000000000000..2ea41a3a487d4ab16acbe50da9cf4eecbb5b1780 --- /dev/null +++ b/FinNLP/docs/FinNLP/site/404.html @@ -0,0 +1,629 @@ + + + + + + + + + + + + + + + + + + + + FinGPT & FinNLP + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ +

404 - Not found

+ +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/site/assets/images/favicon.png b/FinNLP/docs/FinNLP/site/assets/images/favicon.png new file mode 100644 index 0000000000000000000000000000000000000000..1cf13b9f9d978896599290a74f77d5dbe7d1655c Binary files /dev/null and b/FinNLP/docs/FinNLP/site/assets/images/favicon.png differ diff --git a/FinNLP/docs/FinNLP/site/assets/javascripts/bundle.51198bba.min.js b/FinNLP/docs/FinNLP/site/assets/javascripts/bundle.51198bba.min.js new file mode 100644 index 0000000000000000000000000000000000000000..31bd0414c643071e67a9ee83cd9ef6403d2d6e76 --- /dev/null +++ b/FinNLP/docs/FinNLP/site/assets/javascripts/bundle.51198bba.min.js @@ -0,0 +1,29 @@ +"use strict";(()=>{var Ri=Object.create;var gr=Object.defineProperty;var ki=Object.getOwnPropertyDescriptor;var Hi=Object.getOwnPropertyNames,Ht=Object.getOwnPropertySymbols,Pi=Object.getPrototypeOf,yr=Object.prototype.hasOwnProperty,on=Object.prototype.propertyIsEnumerable;var nn=(e,t,r)=>t in e?gr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,P=(e,t)=>{for(var r in t||(t={}))yr.call(t,r)&&nn(e,r,t[r]);if(Ht)for(var r of Ht(t))on.call(t,r)&&nn(e,r,t[r]);return e};var an=(e,t)=>{var r={};for(var n in e)yr.call(e,n)&&t.indexOf(n)<0&&(r[n]=e[n]);if(e!=null&&Ht)for(var n of Ht(e))t.indexOf(n)<0&&on.call(e,n)&&(r[n]=e[n]);return r};var Pt=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var $i=(e,t,r,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of Hi(t))!yr.call(e,o)&&o!==r&&gr(e,o,{get:()=>t[o],enumerable:!(n=ki(t,o))||n.enumerable});return e};var yt=(e,t,r)=>(r=e!=null?Ri(Pi(e)):{},$i(t||!e||!e.__esModule?gr(r,"default",{value:e,enumerable:!0}):r,e));var cn=Pt((xr,sn)=>{(function(e,t){typeof xr=="object"&&typeof sn!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(xr,function(){"use strict";function e(r){var 
n=!0,o=!1,i=null,s={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function a(T){return!!(T&&T!==document&&T.nodeName!=="HTML"&&T.nodeName!=="BODY"&&"classList"in T&&"contains"in T.classList)}function c(T){var Qe=T.type,De=T.tagName;return!!(De==="INPUT"&&s[Qe]&&!T.readOnly||De==="TEXTAREA"&&!T.readOnly||T.isContentEditable)}function f(T){T.classList.contains("focus-visible")||(T.classList.add("focus-visible"),T.setAttribute("data-focus-visible-added",""))}function u(T){T.hasAttribute("data-focus-visible-added")&&(T.classList.remove("focus-visible"),T.removeAttribute("data-focus-visible-added"))}function p(T){T.metaKey||T.altKey||T.ctrlKey||(a(r.activeElement)&&f(r.activeElement),n=!0)}function m(T){n=!1}function d(T){a(T.target)&&(n||c(T.target))&&f(T.target)}function h(T){a(T.target)&&(T.target.classList.contains("focus-visible")||T.target.hasAttribute("data-focus-visible-added"))&&(o=!0,window.clearTimeout(i),i=window.setTimeout(function(){o=!1},100),u(T.target))}function v(T){document.visibilityState==="hidden"&&(o&&(n=!0),G())}function G(){document.addEventListener("mousemove",N),document.addEventListener("mousedown",N),document.addEventListener("mouseup",N),document.addEventListener("pointermove",N),document.addEventListener("pointerdown",N),document.addEventListener("pointerup",N),document.addEventListener("touchmove",N),document.addEventListener("touchstart",N),document.addEventListener("touchend",N)}function oe(){document.removeEventListener("mousemove",N),document.removeEventListener("mousedown",N),document.removeEventListener("mouseup",N),document.removeEventListener("pointermove",N),document.removeEventListener("pointerdown",N),document.removeEventListener("pointerup",N),document.removeEventListener("touchmove",N),document.removeEventListener("touchstart",N),document.removeEventListener("touchend",N)}function 
N(T){T.target.nodeName&&T.target.nodeName.toLowerCase()==="html"||(n=!1,oe())}document.addEventListener("keydown",p,!0),document.addEventListener("mousedown",m,!0),document.addEventListener("pointerdown",m,!0),document.addEventListener("touchstart",m,!0),document.addEventListener("visibilitychange",v,!0),G(),r.addEventListener("focus",d,!0),r.addEventListener("blur",h,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var fn=Pt(Er=>{(function(e){var t=function(){try{return!!Symbol.iterator}catch(f){return!1}},r=t(),n=function(f){var u={next:function(){var p=f.shift();return{done:p===void 0,value:p}}};return r&&(u[Symbol.iterator]=function(){return u}),u},o=function(f){return encodeURIComponent(f).replace(/%20/g,"+")},i=function(f){return decodeURIComponent(String(f).replace(/\+/g," "))},s=function(){var f=function(p){Object.defineProperty(this,"_entries",{writable:!0,value:{}});var m=typeof p;if(m!=="undefined")if(m==="string")p!==""&&this._fromString(p);else if(p instanceof f){var d=this;p.forEach(function(oe,N){d.append(N,oe)})}else if(p!==null&&m==="object")if(Object.prototype.toString.call(p)==="[object Array]")for(var h=0;hd[0]?1:0}),f._entries&&(f._entries={});for(var p=0;p1?i(d[1]):"")}})})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er);(function(e){var t=function(){try{var o=new e.URL("b","http://a");return o.pathname="c 
d",o.href==="http://a/c%20d"&&o.searchParams}catch(i){return!1}},r=function(){var o=e.URL,i=function(c,f){typeof c!="string"&&(c=String(c)),f&&typeof f!="string"&&(f=String(f));var u=document,p;if(f&&(e.location===void 0||f!==e.location.href)){f=f.toLowerCase(),u=document.implementation.createHTMLDocument(""),p=u.createElement("base"),p.href=f,u.head.appendChild(p);try{if(p.href.indexOf(f)!==0)throw new Error(p.href)}catch(T){throw new Error("URL unable to set base "+f+" due to "+T)}}var m=u.createElement("a");m.href=c,p&&(u.body.appendChild(m),m.href=m.href);var d=u.createElement("input");if(d.type="url",d.value=c,m.protocol===":"||!/:/.test(m.href)||!d.checkValidity()&&!f)throw new TypeError("Invalid URL");Object.defineProperty(this,"_anchorElement",{value:m});var h=new e.URLSearchParams(this.search),v=!0,G=!0,oe=this;["append","delete","set"].forEach(function(T){var Qe=h[T];h[T]=function(){Qe.apply(h,arguments),v&&(G=!1,oe.search=h.toString(),G=!0)}}),Object.defineProperty(this,"searchParams",{value:h,enumerable:!0});var N=void 0;Object.defineProperty(this,"_updateSearchParams",{enumerable:!1,configurable:!1,writable:!1,value:function(){this.search!==N&&(N=this.search,G&&(v=!1,this.searchParams._fromString(this.search),v=!0))}})},s=i.prototype,a=function(c){Object.defineProperty(s,c,{get:function(){return this._anchorElement[c]},set:function(f){this._anchorElement[c]=f},enumerable:!0})};["hash","host","hostname","port","protocol"].forEach(function(c){a(c)}),Object.defineProperty(s,"search",{get:function(){return this._anchorElement.search},set:function(c){this._anchorElement.search=c,this._updateSearchParams()},enumerable:!0}),Object.defineProperties(s,{toString:{get:function(){var c=this;return function(){return c.href}}},href:{get:function(){return this._anchorElement.href.replace(/\?$/,"")},set:function(c){this._anchorElement.href=c,this._updateSearchParams()},enumerable:!0},pathname:{get:function(){return 
this._anchorElement.pathname.replace(/(^\/?)/,"/")},set:function(c){this._anchorElement.pathname=c},enumerable:!0},origin:{get:function(){var c={"http:":80,"https:":443,"ftp:":21}[this._anchorElement.protocol],f=this._anchorElement.port!=c&&this._anchorElement.port!=="";return this._anchorElement.protocol+"//"+this._anchorElement.hostname+(f?":"+this._anchorElement.port:"")},enumerable:!0},password:{get:function(){return""},set:function(c){},enumerable:!0},username:{get:function(){return""},set:function(c){},enumerable:!0}}),i.createObjectURL=function(c){return o.createObjectURL.apply(o,arguments)},i.revokeObjectURL=function(c){return o.revokeObjectURL.apply(o,arguments)},e.URL=i};if(t()||r(),e.location!==void 0&&!("origin"in e.location)){var n=function(){return e.location.protocol+"//"+e.location.hostname+(e.location.port?":"+e.location.port:"")};try{Object.defineProperty(e.location,"origin",{get:n,enumerable:!0})}catch(o){setInterval(function(){e.location.origin=n()},100)}}})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Er)});var Kr=Pt((Mt,qr)=>{/*! 
+ * clipboard.js v2.0.11 + * https://clipboardjs.com/ + * + * Licensed MIT © Zeno Rocha + */(function(t,r){typeof Mt=="object"&&typeof qr=="object"?qr.exports=r():typeof define=="function"&&define.amd?define([],r):typeof Mt=="object"?Mt.ClipboardJS=r():t.ClipboardJS=r()})(Mt,function(){return function(){var e={686:function(n,o,i){"use strict";i.d(o,{default:function(){return Ci}});var s=i(279),a=i.n(s),c=i(370),f=i.n(c),u=i(817),p=i.n(u);function m(j){try{return document.execCommand(j)}catch(O){return!1}}var d=function(O){var E=p()(O);return m("cut"),E},h=d;function v(j){var O=document.documentElement.getAttribute("dir")==="rtl",E=document.createElement("textarea");E.style.fontSize="12pt",E.style.border="0",E.style.padding="0",E.style.margin="0",E.style.position="absolute",E.style[O?"right":"left"]="-9999px";var H=window.pageYOffset||document.documentElement.scrollTop;return E.style.top="".concat(H,"px"),E.setAttribute("readonly",""),E.value=j,E}var G=function(O,E){var H=v(O);E.container.appendChild(H);var I=p()(H);return m("copy"),H.remove(),I},oe=function(O){var E=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},H="";return typeof O=="string"?H=G(O,E):O instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(O==null?void 0:O.type)?H=G(O.value,E):(H=p()(O),m("copy")),H},N=oe;function T(j){return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?T=function(E){return typeof E}:T=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},T(j)}var Qe=function(){var O=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},E=O.action,H=E===void 0?"copy":E,I=O.container,q=O.target,Me=O.text;if(H!=="copy"&&H!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(q!==void 0)if(q&&T(q)==="object"&&q.nodeType===1){if(H==="copy"&&q.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. 
Please use "readonly" instead of "disabled" attribute');if(H==="cut"&&(q.hasAttribute("readonly")||q.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(Me)return N(Me,{container:I});if(q)return H==="cut"?h(q):N(q,{container:I})},De=Qe;function $e(j){return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?$e=function(E){return typeof E}:$e=function(E){return E&&typeof Symbol=="function"&&E.constructor===Symbol&&E!==Symbol.prototype?"symbol":typeof E},$e(j)}function wi(j,O){if(!(j instanceof O))throw new TypeError("Cannot call a class as a function")}function rn(j,O){for(var E=0;E0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof I.action=="function"?I.action:this.defaultAction,this.target=typeof I.target=="function"?I.target:this.defaultTarget,this.text=typeof I.text=="function"?I.text:this.defaultText,this.container=$e(I.container)==="object"?I.container:document.body}},{key:"listenClick",value:function(I){var q=this;this.listener=f()(I,"click",function(Me){return q.onClick(Me)})}},{key:"onClick",value:function(I){var q=I.delegateTarget||I.currentTarget,Me=this.action(q)||"copy",kt=De({action:Me,container:this.container,target:this.target(q),text:this.text(q)});this.emit(kt?"success":"error",{action:Me,text:kt,trigger:q,clearSelection:function(){q&&q.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(I){return vr("action",I)}},{key:"defaultTarget",value:function(I){var q=vr("target",I);if(q)return document.querySelector(q)}},{key:"defaultText",value:function(I){return vr("text",I)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(I){var q=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return N(I,q)}},{key:"cut",value:function(I){return 
h(I)}},{key:"isSupported",value:function(){var I=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],q=typeof I=="string"?[I]:I,Me=!!document.queryCommandSupported;return q.forEach(function(kt){Me=Me&&!!document.queryCommandSupported(kt)}),Me}}]),E}(a()),Ci=Ai},828:function(n){var o=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function s(a,c){for(;a&&a.nodeType!==o;){if(typeof a.matches=="function"&&a.matches(c))return a;a=a.parentNode}}n.exports=s},438:function(n,o,i){var s=i(828);function a(u,p,m,d,h){var v=f.apply(this,arguments);return u.addEventListener(m,v,h),{destroy:function(){u.removeEventListener(m,v,h)}}}function c(u,p,m,d,h){return typeof u.addEventListener=="function"?a.apply(null,arguments):typeof m=="function"?a.bind(null,document).apply(null,arguments):(typeof u=="string"&&(u=document.querySelectorAll(u)),Array.prototype.map.call(u,function(v){return a(v,p,m,d,h)}))}function f(u,p,m,d){return function(h){h.delegateTarget=s(h.target,p),h.delegateTarget&&d.call(u,h)}}n.exports=c},879:function(n,o){o.node=function(i){return i!==void 0&&i instanceof HTMLElement&&i.nodeType===1},o.nodeList=function(i){var s=Object.prototype.toString.call(i);return i!==void 0&&(s==="[object NodeList]"||s==="[object HTMLCollection]")&&"length"in i&&(i.length===0||o.node(i[0]))},o.string=function(i){return typeof i=="string"||i instanceof String},o.fn=function(i){var s=Object.prototype.toString.call(i);return s==="[object Function]"}},370:function(n,o,i){var s=i(879),a=i(438);function c(m,d,h){if(!m&&!d&&!h)throw new Error("Missing required arguments");if(!s.string(d))throw new TypeError("Second argument must be a String");if(!s.fn(h))throw new TypeError("Third argument must be a Function");if(s.node(m))return f(m,d,h);if(s.nodeList(m))return u(m,d,h);if(s.string(m))return p(m,d,h);throw new 
TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function f(m,d,h){return m.addEventListener(d,h),{destroy:function(){m.removeEventListener(d,h)}}}function u(m,d,h){return Array.prototype.forEach.call(m,function(v){v.addEventListener(d,h)}),{destroy:function(){Array.prototype.forEach.call(m,function(v){v.removeEventListener(d,h)})}}}function p(m,d,h){return a(document.body,m,d,h)}n.exports=c},817:function(n){function o(i){var s;if(i.nodeName==="SELECT")i.focus(),s=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var a=i.hasAttribute("readonly");a||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),a||i.removeAttribute("readonly"),s=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var c=window.getSelection(),f=document.createRange();f.selectNodeContents(i),c.removeAllRanges(),c.addRange(f),s=c.toString()}return s}n.exports=o},279:function(n){function o(){}o.prototype={on:function(i,s,a){var c=this.e||(this.e={});return(c[i]||(c[i]=[])).push({fn:s,ctx:a}),this},once:function(i,s,a){var c=this;function f(){c.off(i,f),s.apply(a,arguments)}return f._=s,this.on(i,f,a)},emit:function(i){var s=[].slice.call(arguments,1),a=((this.e||(this.e={}))[i]||[]).slice(),c=0,f=a.length;for(c;c{"use strict";/*! 
+ * escape-html + * Copyright(c) 2012-2013 TJ Holowaychuk + * Copyright(c) 2015 Andreas Lubbe + * Copyright(c) 2015 Tiancheng "Timothy" Gu + * MIT Licensed + */var ns=/["'&<>]/;Go.exports=os;function os(e){var t=""+e,r=ns.exec(t);if(!r)return t;var n,o="",i=0,s=0;for(i=r.index;i0&&i[i.length-1])&&(f[0]===6||f[0]===2)){r=0;continue}if(f[0]===3&&(!i||f[1]>i[0]&&f[1]=e.length&&(e=void 0),{value:e&&e[n++],done:!e}}};throw new TypeError(t?"Object is not iterable.":"Symbol.iterator is not defined.")}function W(e,t){var r=typeof Symbol=="function"&&e[Symbol.iterator];if(!r)return e;var n=r.call(e),o,i=[],s;try{for(;(t===void 0||t-- >0)&&!(o=n.next()).done;)i.push(o.value)}catch(a){s={error:a}}finally{try{o&&!o.done&&(r=n.return)&&r.call(n)}finally{if(s)throw s.error}}return i}function D(e,t,r){if(r||arguments.length===2)for(var n=0,o=t.length,i;n1||a(m,d)})})}function a(m,d){try{c(n[m](d))}catch(h){p(i[0][3],h)}}function c(m){m.value instanceof et?Promise.resolve(m.value.v).then(f,u):p(i[0][2],m)}function f(m){a("next",m)}function u(m){a("throw",m)}function p(m,d){m(d),i.shift(),i.length&&a(i[0][0],i[0][1])}}function ln(e){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var t=e[Symbol.asyncIterator],r;return t?t.call(e):(e=typeof Ee=="function"?Ee(e):e[Symbol.iterator](),r={},n("next"),n("throw"),n("return"),r[Symbol.asyncIterator]=function(){return this},r);function n(i){r[i]=e[i]&&function(s){return new Promise(function(a,c){s=e[i](s),o(a,c,s.done,s.value)})}}function o(i,s,a,c){Promise.resolve(c).then(function(f){i({value:f,done:a})},s)}}function C(e){return typeof e=="function"}function at(e){var t=function(n){Error.call(n),n.stack=new Error().stack},r=e(t);return r.prototype=Object.create(Error.prototype),r.prototype.constructor=r,r}var It=at(function(e){return function(r){e(this),this.message=r?r.length+` errors occurred during unsubscription: +`+r.map(function(n,o){return o+1+") "+n.toString()}).join(` + 
`):"",this.name="UnsubscriptionError",this.errors=r}});function Ve(e,t){if(e){var r=e.indexOf(t);0<=r&&e.splice(r,1)}}var Ie=function(){function e(t){this.initialTeardown=t,this.closed=!1,this._parentage=null,this._finalizers=null}return e.prototype.unsubscribe=function(){var t,r,n,o,i;if(!this.closed){this.closed=!0;var s=this._parentage;if(s)if(this._parentage=null,Array.isArray(s))try{for(var a=Ee(s),c=a.next();!c.done;c=a.next()){var f=c.value;f.remove(this)}}catch(v){t={error:v}}finally{try{c&&!c.done&&(r=a.return)&&r.call(a)}finally{if(t)throw t.error}}else s.remove(this);var u=this.initialTeardown;if(C(u))try{u()}catch(v){i=v instanceof It?v.errors:[v]}var p=this._finalizers;if(p){this._finalizers=null;try{for(var m=Ee(p),d=m.next();!d.done;d=m.next()){var h=d.value;try{mn(h)}catch(v){i=i!=null?i:[],v instanceof It?i=D(D([],W(i)),W(v.errors)):i.push(v)}}}catch(v){n={error:v}}finally{try{d&&!d.done&&(o=m.return)&&o.call(m)}finally{if(n)throw n.error}}}if(i)throw new It(i)}},e.prototype.add=function(t){var r;if(t&&t!==this)if(this.closed)mn(t);else{if(t instanceof e){if(t.closed||t._hasParent(this))return;t._addParent(this)}(this._finalizers=(r=this._finalizers)!==null&&r!==void 0?r:[]).push(t)}},e.prototype._hasParent=function(t){var r=this._parentage;return r===t||Array.isArray(r)&&r.includes(t)},e.prototype._addParent=function(t){var r=this._parentage;this._parentage=Array.isArray(r)?(r.push(t),r):r?[r,t]:t},e.prototype._removeParent=function(t){var r=this._parentage;r===t?this._parentage=null:Array.isArray(r)&&Ve(r,t)},e.prototype.remove=function(t){var r=this._finalizers;r&&Ve(r,t),t instanceof e&&t._removeParent(this)},e.EMPTY=function(){var t=new e;return t.closed=!0,t}(),e}();var Sr=Ie.EMPTY;function jt(e){return e instanceof Ie||e&&"closed"in e&&C(e.remove)&&C(e.add)&&C(e.unsubscribe)}function mn(e){C(e)?e():e.unsubscribe()}var Le={onUnhandledError:null,onStoppedNotification:null,Promise:void 
0,useDeprecatedSynchronousErrorHandling:!1,useDeprecatedNextContext:!1};var st={setTimeout:function(e,t){for(var r=[],n=2;n0},enumerable:!1,configurable:!0}),t.prototype._trySubscribe=function(r){return this._throwIfClosed(),e.prototype._trySubscribe.call(this,r)},t.prototype._subscribe=function(r){return this._throwIfClosed(),this._checkFinalizedStatuses(r),this._innerSubscribe(r)},t.prototype._innerSubscribe=function(r){var n=this,o=this,i=o.hasError,s=o.isStopped,a=o.observers;return i||s?Sr:(this.currentObservers=null,a.push(r),new Ie(function(){n.currentObservers=null,Ve(a,r)}))},t.prototype._checkFinalizedStatuses=function(r){var n=this,o=n.hasError,i=n.thrownError,s=n.isStopped;o?r.error(i):s&&r.complete()},t.prototype.asObservable=function(){var r=new F;return r.source=this,r},t.create=function(r,n){return new En(r,n)},t}(F);var En=function(e){ie(t,e);function t(r,n){var o=e.call(this)||this;return o.destination=r,o.source=n,o}return t.prototype.next=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.next)===null||o===void 0||o.call(n,r)},t.prototype.error=function(r){var n,o;(o=(n=this.destination)===null||n===void 0?void 0:n.error)===null||o===void 0||o.call(n,r)},t.prototype.complete=function(){var r,n;(n=(r=this.destination)===null||r===void 0?void 0:r.complete)===null||n===void 0||n.call(r)},t.prototype._subscribe=function(r){var n,o;return(o=(n=this.source)===null||n===void 0?void 0:n.subscribe(r))!==null&&o!==void 0?o:Sr},t}(x);var Et={now:function(){return(Et.delegate||Date).now()},delegate:void 0};var wt=function(e){ie(t,e);function t(r,n,o){r===void 0&&(r=1/0),n===void 0&&(n=1/0),o===void 0&&(o=Et);var i=e.call(this)||this;return i._bufferSize=r,i._windowTime=n,i._timestampProvider=o,i._buffer=[],i._infiniteTimeWindow=!0,i._infiniteTimeWindow=n===1/0,i._bufferSize=Math.max(1,r),i._windowTime=Math.max(1,n),i}return t.prototype.next=function(r){var 
n=this,o=n.isStopped,i=n._buffer,s=n._infiniteTimeWindow,a=n._timestampProvider,c=n._windowTime;o||(i.push(r),!s&&i.push(a.now()+c)),this._trimBuffer(),e.prototype.next.call(this,r)},t.prototype._subscribe=function(r){this._throwIfClosed(),this._trimBuffer();for(var n=this._innerSubscribe(r),o=this,i=o._infiniteTimeWindow,s=o._buffer,a=s.slice(),c=0;c0?e.prototype.requestAsyncId.call(this,r,n,o):(r.actions.push(this),r._scheduled||(r._scheduled=ut.requestAnimationFrame(function(){return r.flush(void 0)})))},t.prototype.recycleAsyncId=function(r,n,o){var i;if(o===void 0&&(o=0),o!=null?o>0:this.delay>0)return e.prototype.recycleAsyncId.call(this,r,n,o);var s=r.actions;n!=null&&((i=s[s.length-1])===null||i===void 0?void 0:i.id)!==n&&(ut.cancelAnimationFrame(n),r._scheduled=void 0)},t}(Wt);var Tn=function(e){ie(t,e);function t(){return e!==null&&e.apply(this,arguments)||this}return t.prototype.flush=function(r){this._active=!0;var n=this._scheduled;this._scheduled=void 0;var o=this.actions,i;r=r||o.shift();do if(i=r.execute(r.state,r.delay))break;while((r=o[0])&&r.id===n&&o.shift());if(this._active=!1,i){for(;(r=o[0])&&r.id===n&&o.shift();)r.unsubscribe();throw i}},t}(Dt);var Te=new Tn(Sn);var _=new F(function(e){return e.complete()});function Vt(e){return e&&C(e.schedule)}function Cr(e){return e[e.length-1]}function Ye(e){return C(Cr(e))?e.pop():void 0}function Oe(e){return Vt(Cr(e))?e.pop():void 0}function zt(e,t){return typeof Cr(e)=="number"?e.pop():t}var pt=function(e){return e&&typeof e.length=="number"&&typeof e!="function"};function Nt(e){return C(e==null?void 0:e.then)}function qt(e){return C(e[ft])}function Kt(e){return Symbol.asyncIterator&&C(e==null?void 0:e[Symbol.asyncIterator])}function Qt(e){return new TypeError("You provided "+(e!==null&&typeof e=="object"?"an invalid object":"'"+e+"'")+" where a stream was expected. 
You can provide an Observable, Promise, ReadableStream, Array, AsyncIterable, or Iterable.")}function Ni(){return typeof Symbol!="function"||!Symbol.iterator?"@@iterator":Symbol.iterator}var Yt=Ni();function Gt(e){return C(e==null?void 0:e[Yt])}function Bt(e){return pn(this,arguments,function(){var r,n,o,i;return $t(this,function(s){switch(s.label){case 0:r=e.getReader(),s.label=1;case 1:s.trys.push([1,,9,10]),s.label=2;case 2:return[4,et(r.read())];case 3:return n=s.sent(),o=n.value,i=n.done,i?[4,et(void 0)]:[3,5];case 4:return[2,s.sent()];case 5:return[4,et(o)];case 6:return[4,s.sent()];case 7:return s.sent(),[3,2];case 8:return[3,10];case 9:return r.releaseLock(),[7];case 10:return[2]}})})}function Jt(e){return C(e==null?void 0:e.getReader)}function U(e){if(e instanceof F)return e;if(e!=null){if(qt(e))return qi(e);if(pt(e))return Ki(e);if(Nt(e))return Qi(e);if(Kt(e))return On(e);if(Gt(e))return Yi(e);if(Jt(e))return Gi(e)}throw Qt(e)}function qi(e){return new F(function(t){var r=e[ft]();if(C(r.subscribe))return r.subscribe(t);throw new TypeError("Provided object does not correctly implement Symbol.observable")})}function Ki(e){return new F(function(t){for(var r=0;r=2;return function(n){return n.pipe(e?A(function(o,i){return e(o,i,n)}):de,ge(1),r?He(t):Vn(function(){return new Zt}))}}function zn(){for(var e=[],t=0;t=2,!0))}function pe(e){e===void 0&&(e={});var t=e.connector,r=t===void 0?function(){return new x}:t,n=e.resetOnError,o=n===void 0?!0:n,i=e.resetOnComplete,s=i===void 0?!0:i,a=e.resetOnRefCountZero,c=a===void 0?!0:a;return function(f){var u,p,m,d=0,h=!1,v=!1,G=function(){p==null||p.unsubscribe(),p=void 0},oe=function(){G(),u=m=void 0,h=v=!1},N=function(){var T=u;oe(),T==null||T.unsubscribe()};return y(function(T,Qe){d++,!v&&!h&&G();var De=m=m!=null?m:r();Qe.add(function(){d--,d===0&&!v&&!h&&(p=$r(N,c))}),De.subscribe(Qe),!u&&d>0&&(u=new rt({next:function($e){return 
De.next($e)},error:function($e){v=!0,G(),p=$r(oe,o,$e),De.error($e)},complete:function(){h=!0,G(),p=$r(oe,s),De.complete()}}),U(T).subscribe(u))})(f)}}function $r(e,t){for(var r=[],n=2;ne.next(document)),e}function K(e,t=document){return Array.from(t.querySelectorAll(e))}function z(e,t=document){let r=ce(e,t);if(typeof r=="undefined")throw new ReferenceError(`Missing element: expected "${e}" to be present`);return r}function ce(e,t=document){return t.querySelector(e)||void 0}function _e(){return document.activeElement instanceof HTMLElement&&document.activeElement||void 0}function tr(e){return L(b(document.body,"focusin"),b(document.body,"focusout")).pipe(ke(1),l(()=>{let t=_e();return typeof t!="undefined"?e.contains(t):!1}),V(e===_e()),B())}function Xe(e){return{x:e.offsetLeft,y:e.offsetTop}}function Qn(e){return L(b(window,"load"),b(window,"resize")).pipe(Ce(0,Te),l(()=>Xe(e)),V(Xe(e)))}function rr(e){return{x:e.scrollLeft,y:e.scrollTop}}function dt(e){return L(b(e,"scroll"),b(window,"resize")).pipe(Ce(0,Te),l(()=>rr(e)),V(rr(e)))}var Gn=function(){if(typeof Map!="undefined")return Map;function e(t,r){var n=-1;return t.some(function(o,i){return o[0]===r?(n=i,!0):!1}),n}return function(){function t(){this.__entries__=[]}return Object.defineProperty(t.prototype,"size",{get:function(){return this.__entries__.length},enumerable:!0,configurable:!0}),t.prototype.get=function(r){var n=e(this.__entries__,r),o=this.__entries__[n];return o&&o[1]},t.prototype.set=function(r,n){var o=e(this.__entries__,r);~o?this.__entries__[o][1]=n:this.__entries__.push([r,n])},t.prototype.delete=function(r){var n=this.__entries__,o=e(n,r);~o&&n.splice(o,1)},t.prototype.has=function(r){return!!~e(this.__entries__,r)},t.prototype.clear=function(){this.__entries__.splice(0)},t.prototype.forEach=function(r,n){n===void 0&&(n=null);for(var 
o=0,i=this.__entries__;o0},e.prototype.connect_=function(){!Dr||this.connected_||(document.addEventListener("transitionend",this.onTransitionEnd_),window.addEventListener("resize",this.refresh),ga?(this.mutationsObserver_=new MutationObserver(this.refresh),this.mutationsObserver_.observe(document,{attributes:!0,childList:!0,characterData:!0,subtree:!0})):(document.addEventListener("DOMSubtreeModified",this.refresh),this.mutationEventsAdded_=!0),this.connected_=!0)},e.prototype.disconnect_=function(){!Dr||!this.connected_||(document.removeEventListener("transitionend",this.onTransitionEnd_),window.removeEventListener("resize",this.refresh),this.mutationsObserver_&&this.mutationsObserver_.disconnect(),this.mutationEventsAdded_&&document.removeEventListener("DOMSubtreeModified",this.refresh),this.mutationsObserver_=null,this.mutationEventsAdded_=!1,this.connected_=!1)},e.prototype.onTransitionEnd_=function(t){var r=t.propertyName,n=r===void 0?"":r,o=va.some(function(i){return!!~n.indexOf(i)});o&&this.refresh()},e.getInstance=function(){return this.instance_||(this.instance_=new e),this.instance_},e.instance_=null,e}(),Bn=function(e,t){for(var r=0,n=Object.keys(t);r0},e}(),Xn=typeof WeakMap!="undefined"?new WeakMap:new Gn,Zn=function(){function e(t){if(!(this instanceof e))throw new TypeError("Cannot call a class as a function.");if(!arguments.length)throw new TypeError("1 argument required, but only 0 present.");var r=ya.getInstance(),n=new Aa(t,r,this);Xn.set(this,n)}return e}();["observe","unobserve","disconnect"].forEach(function(e){Zn.prototype[e]=function(){var t;return(t=Xn.get(this))[e].apply(t,arguments)}});var Ca=function(){return typeof nr.ResizeObserver!="undefined"?nr.ResizeObserver:Zn}(),eo=Ca;var to=new x,Ra=$(()=>k(new eo(e=>{for(let t of e)to.next(t)}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),J(1));function he(e){return{width:e.offsetWidth,height:e.offsetHeight}}function ye(e){return 
Ra.pipe(S(t=>t.observe(e)),g(t=>to.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(()=>he(e)))),V(he(e)))}function bt(e){return{width:e.scrollWidth,height:e.scrollHeight}}function ar(e){let t=e.parentElement;for(;t&&(e.scrollWidth<=t.scrollWidth&&e.scrollHeight<=t.scrollHeight);)t=(e=t).parentElement;return t?e:void 0}var ro=new x,ka=$(()=>k(new IntersectionObserver(e=>{for(let t of e)ro.next(t)},{threshold:0}))).pipe(g(e=>L(ze,k(e)).pipe(R(()=>e.disconnect()))),J(1));function sr(e){return ka.pipe(S(t=>t.observe(e)),g(t=>ro.pipe(A(({target:r})=>r===e),R(()=>t.unobserve(e)),l(({isIntersecting:r})=>r))))}function no(e,t=16){return dt(e).pipe(l(({y:r})=>{let n=he(e),o=bt(e);return r>=o.height-n.height-t}),B())}var cr={drawer:z("[data-md-toggle=drawer]"),search:z("[data-md-toggle=search]")};function oo(e){return cr[e].checked}function Ke(e,t){cr[e].checked!==t&&cr[e].click()}function Ue(e){let t=cr[e];return b(t,"change").pipe(l(()=>t.checked),V(t.checked))}function Ha(e,t){switch(e.constructor){case HTMLInputElement:return e.type==="radio"?/^Arrow/.test(t):!0;case HTMLSelectElement:case HTMLTextAreaElement:return!0;default:return e.isContentEditable}}function Pa(){return L(b(window,"compositionstart").pipe(l(()=>!0)),b(window,"compositionend").pipe(l(()=>!1))).pipe(V(!1))}function io(){let e=b(window,"keydown").pipe(A(t=>!(t.metaKey||t.ctrlKey)),l(t=>({mode:oo("search")?"search":"global",type:t.key,claim(){t.preventDefault(),t.stopPropagation()}})),A(({mode:t,type:r})=>{if(t==="global"){let n=_e();if(typeof n!="undefined")return!Ha(n,r)}return!0}),pe());return Pa().pipe(g(t=>t?_:e))}function le(){return new URL(location.href)}function ot(e){location.href=e.href}function ao(){return new x}function so(e,t){if(typeof t=="string"||typeof t=="number")e.innerHTML+=t.toString();else if(t instanceof Node)e.appendChild(t);else if(Array.isArray(t))for(let r of t)so(e,r)}function M(e,t,...r){let n=document.createElement(e);if(t)for(let o of Object.keys(t))typeof 
t[o]!="undefined"&&(typeof t[o]!="boolean"?n.setAttribute(o,t[o]):n.setAttribute(o,""));for(let o of r)so(n,o);return n}function fr(e){if(e>999){let t=+((e-950)%1e3>99);return`${((e+1e-6)/1e3).toFixed(t)}k`}else return e.toString()}function co(){return location.hash.substring(1)}function Vr(e){let t=M("a",{href:e});t.addEventListener("click",r=>r.stopPropagation()),t.click()}function $a(e){return L(b(window,"hashchange"),e).pipe(l(co),V(co()),A(t=>t.length>0),J(1))}function fo(e){return $a(e).pipe(l(t=>ce(`[id="${t}"]`)),A(t=>typeof t!="undefined"))}function zr(e){let t=matchMedia(e);return er(r=>t.addListener(()=>r(t.matches))).pipe(V(t.matches))}function uo(){let e=matchMedia("print");return L(b(window,"beforeprint").pipe(l(()=>!0)),b(window,"afterprint").pipe(l(()=>!1))).pipe(V(e.matches))}function Nr(e,t){return e.pipe(g(r=>r?t():_))}function ur(e,t={credentials:"same-origin"}){return ue(fetch(`${e}`,t)).pipe(fe(()=>_),g(r=>r.status!==200?Tt(()=>new Error(r.statusText)):k(r)))}function We(e,t){return ur(e,t).pipe(g(r=>r.json()),J(1))}function po(e,t){let r=new DOMParser;return ur(e,t).pipe(g(n=>n.text()),l(n=>r.parseFromString(n,"text/xml")),J(1))}function pr(e){let t=M("script",{src:e});return $(()=>(document.head.appendChild(t),L(b(t,"load"),b(t,"error").pipe(g(()=>Tt(()=>new ReferenceError(`Invalid script: ${e}`))))).pipe(l(()=>{}),R(()=>document.head.removeChild(t)),ge(1))))}function lo(){return{x:Math.max(0,scrollX),y:Math.max(0,scrollY)}}function mo(){return L(b(window,"scroll",{passive:!0}),b(window,"resize",{passive:!0})).pipe(l(lo),V(lo()))}function ho(){return{width:innerWidth,height:innerHeight}}function bo(){return b(window,"resize",{passive:!0}).pipe(l(ho),V(ho()))}function vo(){return Q([mo(),bo()]).pipe(l(([e,t])=>({offset:e,size:t})),J(1))}function lr(e,{viewport$:t,header$:r}){let n=t.pipe(Z("size")),o=Q([n,r]).pipe(l(()=>Xe(e)));return 
Q([r,t,o]).pipe(l(([{height:i},{offset:s,size:a},{x:c,y:f}])=>({offset:{x:s.x-c,y:s.y-f+i},size:a})))}(()=>{function e(n,o){parent.postMessage(n,o||"*")}function t(...n){return n.reduce((o,i)=>o.then(()=>new Promise(s=>{let a=document.createElement("script");a.src=i,a.onload=s,document.body.appendChild(a)})),Promise.resolve())}var r=class extends EventTarget{constructor(n){super(),this.url=n,this.m=i=>{i.source===this.w&&(this.dispatchEvent(new MessageEvent("message",{data:i.data})),this.onmessage&&this.onmessage(i))},this.e=(i,s,a,c,f)=>{if(s===`${this.url}`){let u=new ErrorEvent("error",{message:i,filename:s,lineno:a,colno:c,error:f});this.dispatchEvent(u),this.onerror&&this.onerror(u)}};let o=document.createElement("iframe");o.hidden=!0,document.body.appendChild(this.iframe=o),this.w.document.open(),this.w.document.write(` + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + + + + + +
+
+ + + + +

LLMs in financial world and Internet-scale Financial Data

+

The demos are shown in FinGPT and the data sources and supporting codes are in FinNLP

+

中文版请点击这里

+

Disclaimer: We are sharing code for academic purposes under the MIT education license. Nothing herein is financial advice, and NOT a recommendation to trade real money. Please use common sense and always first consult a professional before trading or investing.

+

Ⅰ. Architecture

+

image-20230505200244043

+
    +
  • +

    The whole project is made up of 4 parts:

    +
  • +
  • +

    The first part is the Data Source. Here, we gather past and streaming data from the Internet.

    +
  • +
  • +

    Next, we push the data to the Data Engineering part where we clean the data, tokenize the data and do the prompt engineering

    +
  • +
  • +

    Then, the data is pushed to LLMs. Here, we may use LLMs in different kinds of ways. We can not only use the collected data to train our own light-weight fine-tuning models, but also use that data and the trained models or LLM APIs to support our applications.

    +
  • +
  • The last part is the application part. Here, we can use data and LLMs to build many interesting applications.
  • +
+

Ⅱ. Data Sources

+

image-20230505200446477

+
    +
  • Due to space limitations, we only show a few of them.
  • +
+

1. News

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PlatformData TypeRelated MarketSpecified CompanyRange TypeSource TypeLimitsDocs (1e4)Support
YahooFinancial NewsUS StocksDate RangeOfficialN/A1,500+
ReutersFinancial NewsUS Stocks×Date RangeOfficialN/A1,500+
SinaFinancial NewsCN Stocks×Date RangeOfficialN/A2,000+
EastmoneyFinancial NewsCN StocksDate RangeOfficialN/A1,000+
YicaiFinancial NewsCN StocksDate RangeOfficialN/A500+Soon
CCTVGovernment NewsCN Stocks×Date RangeThird partyN/A4
US MainstreamFinancial NewsUS StocksDate RangeThird partyAccount (Free)3,200+
CN MainstreamFinancial NewsCN Stocks×Date RangeThird party¥500/year3000+
+
    +
  • FinGPT may have fewer docs than Bloomberg, but we're on the same order of magnitude.
  • +
+

2. Social Media

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PlatformData TypeRelated MarketSpecified CompanyRange TypeSource TypeLimitsDocs (1e4)Support
TwitterTweetsUS StocksDate RangeOfficialN/A18,000+
StockTwitsTweetsUS StocksLatestOfficialN/A160,000+
Reddit (wallstreetbets)ThreadsUS Stocks×LatestOfficialN/A9+
WeiboTweetsCN StocksDate RangeOfficialCookies1,400,000+
WeiboTweetsCN StocksLatestOfficialN/A1,400,000+
+
    +
  • In BloombergGPT, they don’t collect social media data, but we believe that public opinion is one of the most important factors influencing the stock market.
  • +
+

3. Company Announcement

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PlatformData TypeRelated MarketSpecified CompanyRange TypeSource TypeLimitsDocs (1e4)Support
Juchao (Official Website)TextCN StocksDate RangeOfficialN/A2,790+
SEC (Official Website)TextUS StocksDate RangeOfficialN/A1,440+
+
    +
  • Since we collect data from different stock markets, we have more filing docs than BloombergGPT.
  • +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PlatformData TypeRelated MarketData SourceSpecified CompanyRange TypeSource TypeLimits
Google TrendsIndexUS StocksGoogle TrendsDate RangeOfficialN/A
Baidu IndexIndexCN StocksSoon----
+

5. Data Sets

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Data SourceTypeStocksDatesAvailable
AShareNews36802018-07-01 to 2021-11-30
stocknet-datasetTweets872014-01-02 to 2015-12-30
CHRNNTweets382017-01-03 to 2017-12-28
+

Ⅲ. Models

+

image-20230505200618504

+
    +
  • In data-centric NLP, we don’t train the model from scratch. We only call APIs and do light-weight fine-tuning.
  • +
  • The left part shows some LLM APIs that we may use, the middle part shows the models that we may fine-tune, and the right part shows some of the fine-tuning methods.
  • +
+

1. Fine-tuning: Tensor Layers (LoRA)

+

image-20230505200944411

+
    +
  • In FinGPT, we fine-tune a pre-trained LLM using a new financial dataset. High-quality labeled data is one of the most important keys to many successful LLMs, including ChatGPT.
  • +
  • However, such high-quality labeled data is often very expensive and time-consuming to obtain, and we may need help from professional finance experts.
  • +
  • If our goal is to use LLMs to analyze financial-related text data and help with quantitative trading, why not let the market do the labeling for us?
  • +
  • So here, we use the related stock price change percent of each news item as the output label. We use a threshold to split the labels into three groups (positive, negative, and neutral) and use them as the labels of the news sentiment.
  • +
  • Correspondingly, in the prompt engineering part we also ask the model to select one of positive, negative, and neutral as the output, so that we make the best use of the pre-trained information.
  • +
  • By using LoRA, we can reduce the trainable parameters from 6.17B to 3.67M.
  • +
  • As the table shows, compared with ChatGLM, FinGPT achieves large improvements on multiple metrics. It may be inappropriate to use our model for quantitative trading directly. Since most news titles are neutral, most of the original outputs of the LLMs are neutral, so LLMs perform poorly on positive and negative labels, and those labels are what might be useful in quantitative trading.
  • +
  • However, after fine-tuning, we have witnessed huge improvements in the prediction of positive and negative labels.
  • +
  • That’s also why the model can achieve positive trading results.
  • +
+

2. Fine-tuning: Reinforcement Learning on Stock Prices (RLSP)

+

image-20230505201209946

+
    +
  • In the same way, we may use RL on Stock Prices (RLSP) to replace RL on Human feedback used by ChatGPT.
  • +
+

Ⅳ. Applications

+

1. Robo Advisor

+

image-20230505201913233

+
    +
  • ChatGPT can give investment advice just like a pro.
  • +
  • In this example, the rising stock price of Apple is in accordance with ChatGPT’s prediction made from the analysis of news.
  • +
+

2. Quantitative Trading

+

image-20230505201841001

+
    +
  • We may also use news, social media tweets, or filings to build sentiment factors. The right part shows the trading results obtained using only the signals from Twitter tweets and ChatGPT; the data is from a data set called stocknet-dataset.
  • +
  • As you may see from the picture, the trading signals generated by ChatGPT are so good that we may even achieve good results just by trading according to Twitter sentiment factors.
  • +
  • So we may even achieve better results by combining price factors.
  • +
+

3. Low-code development

+

image-20230505202028292

+
    +
  • We can use LLMs to help write code.
  • +
  • The right part shows how we can develop our factors and other code quickly and efficiently.
  • +
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_Company_Announcement/index.html b/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_Company_Announcement/index.html new file mode 100644 index 0000000000000000000000000000000000000000..a734e3149f46fa70feb51662c93ebb0f58bad577 --- /dev/null +++ b/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_Company_Announcement/index.html @@ -0,0 +1,2850 @@ + + + + + + + + + + + + + + + + + + + + + + Company Announcement - FinGPT & FinNLP + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + +

Company Announcement

+ + + + + + + + + + + + + + + + + + + +
+ + +
+ + + + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_News/index.html b/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_News/index.html new file mode 100644 index 0000000000000000000000000000000000000000..0cef47b6e56ef26e5f33405ac1d3c5082164132a --- /dev/null +++ b/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_News/index.html @@ -0,0 +1,7324 @@ + + + + + + + + + + + + + + + + + + + + + + + + News - FinGPT & FinNLP + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+ +
+
+ + + +
+
+ + + + +

News

+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
+ + + + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_Social_Media/index.html b/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_Social_Media/index.html new file mode 100644 index 0000000000000000000000000000000000000000..95145adf5cbdddaaf2124ae15e4c97e94fa3045e --- /dev/null +++ b/FinNLP/docs/FinNLP/site/jupyter/Data_Sources_Social_Media/index.html @@ -0,0 +1,5102 @@ + + + + + + + + + + + + + + + + + + + + + + + + Social Media - FinGPT & FinNLP + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+ +
+ + + +
+
+ + + + +

Social Media

+ + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + +
+ + + + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/site/sitemap.xml b/FinNLP/docs/FinNLP/site/sitemap.xml new file mode 100644 index 0000000000000000000000000000000000000000..9ba7561dcd4667af6a2717232c6cc7dbe8b4a9b6 --- /dev/null +++ b/FinNLP/docs/FinNLP/site/sitemap.xml @@ -0,0 +1,28 @@ + + + + None + 2023-08-24 + daily + + + None + 2023-08-24 + daily + + + None + 2023-08-24 + daily + + + None + 2023-08-24 + daily + + + None + 2023-08-24 + daily + + \ No newline at end of file diff --git a/FinNLP/docs/FinNLP/site/sitemap.xml.gz b/FinNLP/docs/FinNLP/site/sitemap.xml.gz new file mode 100644 index 0000000000000000000000000000000000000000..3125477a8a9864594e7005dc29b6b3f6f00b41d4 --- /dev/null +++ b/FinNLP/docs/FinNLP/site/sitemap.xml.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8690e5e23dd4cba4c30f1f0df5602342cf85e1f51bdf402fbbe30f58196dab54 +size 199 diff --git a/FinNLP/docs/FinNLP/site/zh/index.html b/FinNLP/docs/FinNLP/site/zh/index.html new file mode 100644 index 0000000000000000000000000000000000000000..accd7d2d9e702546b83f8f731f3660e434ade850 --- /dev/null +++ b/FinNLP/docs/FinNLP/site/zh/index.html @@ -0,0 +1,1138 @@ + + + + + + + + + + + + + + + + + + + + 互联网金融数据 - FinGPT & FinNLP + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + +
+ + +
+ +
+ + + + + + +
+
+ + + +
+
+
+ + + + +
+
+
+ + + +
+
+
+ + + +
+
+
+ + + +
+
+ + + + +

互联网金融数据

+

演示内容请参见FinGPT

+

免责声明:我们根据MIT教育许可证的规定共享代码以供学术研究之用。此处不构成任何金融建议,亦非交易真实资金的推荐。在交易或投资之前请使用常识并首先咨询专业人士。

+

Ⅰ. 架构

+

image-20230505200244043

+
    +
  • +

    整个项目由4个部分组成:

    +
  • +
  • +

    第一部分是数据源,在这里,我们从互联网上收集历史数据和流式数据。

    +
  • +
  • +

    接下来,我们将数据推送到数据工程部分,在这里我们会对数据进行清洗,标记化处理和提示工程。

    +
  • +
  • +

    然后,数据被推送到大语言模型(LLMs)。在这里,我们可以以不同的方式使用LLMs。我们不仅可以使用收集到的数据来训练我们自己的轻量级微调模型,还可以使用这些数据和训练好的模型或LLM API来支持我们的应用程序。

    +
  • +
  • +

    最后一部分将是应用程序部分,我们可以使用数据和LLMs来制作许多有趣的应用程序。

    +
  • +
+

Ⅱ. 数据源

+

image-20230505200446477

+
    +
  • 由于空间限制,我们只展示了其中一部分。
  • +
+

1. 新闻

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
平台数据类型相关市场指定公司时间范围数据源类型限制条件文档数量(万)支持情况
雅虎金融新闻美国股票时间范围官方N/A1,500+
路透社金融新闻美国股票×时间范围官方N/A1,500+
新浪金融新闻中国股票×时间范围官方N/A2,000+
东方财富金融新闻中国股票时间范围官方N/A1,000+
第一财经金融新闻中国股票时间范围官方N/A500+即将
央视政府新闻中国股票×时间范围第三方N/A4
美国主流媒体金融新闻美国股票时间范围第三方账户 (免费)3,200+
中国主流媒体金融新闻中国股票×时间范围第三方¥500/年3000+
+
    +
  • FinGPT可能比Bloomberg的文档数目更少,但我们在同一个数量级上。
  • +
+

2. 社交媒体

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
平台数据类型相关市场指定公司范围类型来源类型限制文档 (1e4)支持
Twitter推文美国股票时间范围官方N/A18,000+
StockTwits推文美国股票最新官方N/A160,000+
Reddit (wallstreetbets)帖子美国股票×最新官方N/A9+
微博推文中国股票时间范围官方Cookies1,400,000+
微博推文中国股票最新官方N/A1,400,000+
+
    +
  • BloomberGPT 中,他们不收集社交媒体数据,但我们认为公众舆论是干扰股票市场的最重要因素之一
  • +
+

3. 公司公告

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
平台数据类型相关市场指定公司范围类型数据来源限制文档数 (1e4)支持情况
巨潮网 (官方)文本中国股票时间范围官方N/A2,790+
美国证监会 (官方)文本美国股票时间范围官方N/A1,440+
+
    +
  • 由于我们从不同的股票市场收集数据,因此我们比Bloomberg GPT有更多的申报文档。
  • +
+

4. 趋势

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
平台数据类型相关市场数据源指定公司范围类型源类型限制
谷歌趋势指数美国股票Google Trends日期范围官方N/A
百度指数指数中国股票即将推出----
+

5. 数据集

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
数据源类型股票日期可用性
AShare新闻36802018-07-01 到 2021-11-30
stocknet-dataset推文872014-01-02 到 2015-12-30
CHRNN推文382017-01-03 到 2017-12-28
+

Ⅲ. 模型

+

image-20230505200618504

+
    +
  • 在以数据为中心的自然语言处理中,我们不需要从头开始训练模型。我们只需要调用API和进行轻量级的微调。
  • +
  • 左边是一些可能会用到的LLM APIs,中间是我们可能用来进行微调的模型,右边是一些微调方法。
  • +
+

1. 微调:Tensor Layers (LoRA)

+

image-20230505200944411

+
    +
  • 在FinGPT中,我们使用新的金融数据集对预训练的LLM进行微调。高质量的标记数据是许多成功的LLM(包括ChatGPT)的最重要的关键之一。
  • +
  • 然而,这些高质量的标记数据通常非常昂贵和耗时,并且我们可能需要金融专家的帮助。
  • +
  • 如果我们的目标是使用LLM分析与金融相关的文本数据并帮助量化交易,为什么不让市场为我们做标记呢?
  • +
  • 因此,在这里,我们使用每个新闻相关的股票价格变化百分比作为输出标签,我们使用阈值将标签分成三组(积极的,消极的和中立的),并将它们用作新闻情感的标签。
  • +
  • 相应地,在提示工程部分,我们还要求模型选择其中一个正面的,负面的和中性的作为输出,以便我们充分利用预训练信息。
  • +
  • 通过使用LoRA,我们可以将可训练参数从6.17B减少到3.67M。
  • +
  • 如表格所示,与chatGLM相比,FinGPT可以在多个指标上实现大幅改善。然而,直接将我们的模型用于量化交易可能是不合适的。由于大多数新闻标题都是中性的,LLMs的大多数原始输出都是中性的,因此LLMs在积极和消极的标签上表现不佳,而这些标签可能对于量化交易是有用的。
  • +
  • 然而,在微调之后,我们已经见证了在预测积极和消极标签方面的巨大改进。
  • +
  • 这也是为什么该模型可以实现积极的交易结果的原因。
  • +
+

2. 微调:强化学习在股价上的应用 (RLSP)

+

image-20230505201209946

+
    +
  • 同样地,我们可以使用股价上的强化学习(RLSP)来替换ChatGPT中使用的人类反馈上的强化学习。
  • +
+

Ⅳ. 应用

+

1. 智能投顾

+

image-20230505201913233

+
    +
  • ChatGPT可以像专业人士一样进行投资建议。
  • +
  • 在这个例子中,苹果的股价上涨与ChatGPT分析新闻的预测相符
  • +
+

2. 量化交易

+

image-20230505201841001

+
    +
  • 我们还可以使用新闻、社交媒体推文或者公司公告来构建情感因子,右侧的部分是由Twitter推文和ChatGPT信号产生的交易结果,数据来自于一个称为stocknet-dataset的数据集。
  • +
  • 正如您从图片中所看到的,由ChatGPT生成的交易信号非常出色,我们甚至可以仅通过根据Twitter情感因子交易而获得良好的结果
  • +
  • 因此,我们可以通过结合价格因子来获得更好的结果。
  • +
+

3. 低代码开发

+

image-20230505202028292

+
    +
  • 我们可以使用LLMs的帮助来编写代码。
  • +
  • 右侧显示了我们如何快速高效地开发我们的因子和其他代码。
  • +
+ + + + + + +
+
+ + +
+ +
+ + + +
+
+
+
+ + + + + + + + + \ No newline at end of file diff --git a/FinNLP/finnlp/benchmarks/fiqa.py b/FinNLP/finnlp/benchmarks/fiqa.py new file mode 100644 index 0000000000000000000000000000000000000000..03dd806c004fced88f6054cbe1064a33bb8870b0 --- /dev/null +++ b/FinNLP/finnlp/benchmarks/fiqa.py @@ -0,0 +1,85 @@ +import warnings +warnings.filterwarnings("ignore") + +from sklearn.metrics import accuracy_score,f1_score +from datasets import load_dataset +from tqdm import tqdm +import datasets +import torch + +def format_example(example: dict) -> dict: + context = f"Instruction: {example['instruction']}\n" + if example.get("input"): + context += f"Input: {example['input']}\n" + context += "Answer: " + target = example["output"] + return {"context": context, "target": target} + +def add_instructions(x): + if x.format == "post": + return "What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}." + else: + return "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." + +def make_label(x): + if x < - 0.1: return "negative" + elif x >=-0.1 and x < 0.1: return "neutral" + elif x >= 0.1: return "positive" + +def change_target(x): + if 'positive' in x or 'Positive' in x: + return 'positive' + elif 'negative' in x or 'Negative' in x: + return 'negative' + else: + return 'neutral' + +def test_fiqa(model, tokenizer, batch_size = 8, prompt_fun = None ): + dataset = load_dataset('pauri32/fiqa-2018') + dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ]) + dataset = dataset.train_test_split(0.226, seed = 42)['test'] + dataset = dataset.to_pandas() + dataset["output"] = dataset.sentiment_score.apply(make_label) + if prompt_fun is None: + dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 
+ else: + dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) + + dataset = dataset[['sentence', 'output',"instruction"]] + dataset.columns = ["input", "output","instruction"] + dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") + + # print example + print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") + + context = dataset['context'].tolist() + total_steps = dataset.shape[0]//batch_size + 1 + print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") + + out_text_list = [] + + for i in tqdm(range(total_steps)): + tmp_context = context[i* batch_size:(i+1)* batch_size] + tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) + # tokens.pop('token_type_ids') + for k in tokens.keys(): + tokens[k] = tokens[k].cuda() + + res = model.generate(**tokens, max_length=512) + res_sentences = [tokenizer.decode(i) for i in res] + out_text = [o.split("Answer: ")[1] for o in res_sentences] + out_text_list += out_text + torch.cuda.empty_cache() + + dataset["out_text"] = out_text_list + dataset["new_target"] = dataset["target"].apply(change_target) + dataset["new_out"] = dataset["out_text"].apply(change_target) + + acc = accuracy_score(dataset["new_target"], dataset["new_out"]) + f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") + f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") + f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") + + print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. 
") + + return dataset \ No newline at end of file diff --git a/FinNLP/finnlp/benchmarks/fpb.py b/FinNLP/finnlp/benchmarks/fpb.py new file mode 100644 index 0000000000000000000000000000000000000000..39d4730656a281fc356b53a866a812bde9aa16f1 --- /dev/null +++ b/FinNLP/finnlp/benchmarks/fpb.py @@ -0,0 +1,80 @@ +import warnings +warnings.filterwarnings("ignore") + +from sklearn.metrics import accuracy_score,f1_score +from datasets import load_dataset +from tqdm import tqdm +import datasets +import torch + +dic = { + 0:"negative", + 1:'neutral', + 2:'positive', + } + +def format_example(example: dict) -> dict: + context = f"Instruction: {example['instruction']}\n" + if example.get("input"): + context += f"Input: {example['input']}\n" + context += "Answer: " + target = example["output"] + return {"context": context, "target": target} + +def change_target(x): + if 'positive' in x or 'Positive' in x: + return 'positive' + elif 'negative' in x or 'Negative' in x: + return 'negative' + else: + return 'neutral' + +def test_fpb(model, tokenizer, batch_size = 8, prompt_fun = None ): + instructions = load_dataset("financial_phrasebank", "sentences_50agree") + instructions = instructions["train"] + instructions = instructions.train_test_split(seed = 42)['test'] + instructions = instructions.to_pandas() + instructions.columns = ["input", "output"] + instructions["output"] = instructions["output"].apply(lambda x:dic[x]) + + if prompt_fun is None: + instructions["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." + else: + instructions["instruction"] = instructions.apply(prompt_fun, axis = 1) + + instructions[["context","target"]] = instructions.apply(format_example, axis = 1, result_type="expand") + + # print example + print(f"\n\nPrompt example:\n{instructions['context'][0]}\n\n") + + + context = instructions['context'].tolist() + + total_steps = instructions.shape[0]//batch_size + 1 + print(f"Total len: {len(context)}. 
Batchsize: {batch_size}. Total steps: {total_steps}") + + + out_text_list = [] + for i in tqdm(range(total_steps)): + tmp_context = context[i* batch_size:(i+1)* batch_size] + tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) + for k in tokens.keys(): + tokens[k] = tokens[k].cuda() + res = model.generate(**tokens, max_length=512) + res_sentences = [tokenizer.decode(i) for i in res] + out_text = [o.split("Answer: ")[1] for o in res_sentences] + out_text_list += out_text + torch.cuda.empty_cache() + + instructions["out_text"] = out_text_list + instructions["new_target"] = instructions["target"].apply(change_target) + instructions["new_out"] = instructions["out_text"].apply(change_target) + + acc = accuracy_score(instructions["new_target"], instructions["new_out"]) + f1_macro = f1_score(instructions["new_target"], instructions["new_out"], average = "macro") + f1_micro = f1_score(instructions["new_target"], instructions["new_out"], average = "micro") + f1_weighted = f1_score(instructions["new_target"], instructions["new_out"], average = "weighted") + + print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. 
") + + return instructions \ No newline at end of file diff --git a/FinNLP/finnlp/benchmarks/nwgi.py b/FinNLP/finnlp/benchmarks/nwgi.py new file mode 100644 index 0000000000000000000000000000000000000000..9de327d340054d65ee7bab4d1874dd8e6d527a1e --- /dev/null +++ b/FinNLP/finnlp/benchmarks/nwgi.py @@ -0,0 +1,83 @@ +import warnings +warnings.filterwarnings("ignore") + +from sklearn.metrics import accuracy_score,f1_score +from datasets import load_dataset +from tqdm import tqdm +import datasets +import torch + +dic = { + 'strong negative':"negative", + 'moderately negative':"negative", + 'mildly negative':"neutral", + 'strong positive':"positive", + 'moderately positive':"positive", + 'mildly positive':'neutral', + 'neutral':'neutral', +} + +def format_example(example: dict) -> dict: + context = f"Instruction: {example['instruction']}\n" + if example.get("input"): + context += f"Input: {example['input']}\n" + context += "Answer: " + target = example["output"] + return {"context": context, "target": target} + +def change_target(x): + if 'positive' in x or 'Positive' in x: + return 'positive' + elif 'negative' in x or 'Negative' in x: + return 'negative' + else: + return 'neutral' + +def test_nwgi(model, tokenizer, batch_size = 8, prompt_fun = None ): + dataset = datasets.load_dataset('oliverwang15/news_with_gpt_instructions') + dataset = dataset['test'].to_pandas() + dataset['output'] = dataset['label'].apply(lambda x:dic[x]) + + if prompt_fun is None: + dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 
+ else: + dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) + dataset["input"] = dataset["news"] + + dataset = dataset[['input', 'output', 'instruction']] + dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") + + # print example + print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") + + context = dataset['context'].tolist() + + total_steps = dataset.shape[0]//batch_size + 1 + print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") + + + out_text_list = [] + for i in tqdm(range(total_steps)): + tmp_context = context[i* batch_size:(i+1)* batch_size] + tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) + # tokens.pop('token_type_ids') + for k in tokens.keys(): + tokens[k] = tokens[k].cuda() + res = model.generate(**tokens, max_length=512) + res_sentences = [tokenizer.decode(i) for i in res] + out_text = [o.split("Answer: ")[1] for o in res_sentences] + out_text_list += out_text + torch.cuda.empty_cache() + + dataset["out_text"] = out_text_list + dataset["new_target"] = dataset["target"].apply(change_target) + dataset["new_out"] = dataset["out_text"].apply(change_target) + + acc = accuracy_score(dataset["new_target"], dataset["new_out"]) + f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") + f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") + f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") + + print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. 
") + + return dataset diff --git a/FinNLP/finnlp/benchmarks/tfns.py b/FinNLP/finnlp/benchmarks/tfns.py new file mode 100644 index 0000000000000000000000000000000000000000..4e8040194310d7483f01bcb2d917bb2fbc8f4485 --- /dev/null +++ b/FinNLP/finnlp/benchmarks/tfns.py @@ -0,0 +1,79 @@ +import warnings +warnings.filterwarnings("ignore") + +from sklearn.metrics import accuracy_score,f1_score +from datasets import load_dataset +from tqdm import tqdm +import datasets +import torch + +dic = { + 0:"negative", + 1:'positive', + 2:'neutral', +} + +def format_example(example: dict) -> dict: + context = f"Instruction: {example['instruction']}\n" + if example.get("input"): + context += f"Input: {example['input']}\n" + context += "Answer: " + target = example["output"] + return {"context": context, "target": target} + +def change_target(x): + if 'positive' in x or 'Positive' in x: + return 'positive' + elif 'negative' in x or 'Negative' in x: + return 'negative' + else: + return 'neutral' + +def test_tfns(model, tokenizer, batch_size = 8, prompt_fun = None ): + dataset = load_dataset('zeroshot/twitter-financial-news-sentiment') + dataset = dataset['validation'] + dataset = dataset.to_pandas() + dataset['label'] = dataset['label'].apply(lambda x:dic[x]) + + if prompt_fun is None: + dataset["instruction"] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.' + else: + dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) + + dataset.columns = ['input', 'output', 'instruction'] + dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") + + # print example + print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") + + context = dataset['context'].tolist() + + total_steps = dataset.shape[0]//batch_size + 1 + print(f"Total len: {len(context)}. Batchsize: {batch_size}. 
Total steps: {total_steps}") + + + out_text_list = [] + for i in tqdm(range(total_steps)): + tmp_context = context[i* batch_size:(i+1)* batch_size] + tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) + # tokens.pop('token_type_ids') + for k in tokens.keys(): + tokens[k] = tokens[k].cuda() + res = model.generate(**tokens, max_length=512) + res_sentences = [tokenizer.decode(i) for i in res] + out_text = [o.split("Answer: ")[1] for o in res_sentences] + out_text_list += out_text + torch.cuda.empty_cache() + + dataset["out_text"] = out_text_list + dataset["new_target"] = dataset["target"].apply(change_target) + dataset["new_out"] = dataset["out_text"].apply(change_target) + + acc = accuracy_score(dataset["new_target"], dataset["new_out"]) + f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") + f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") + f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") + + print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. 
") + + return dataset \ No newline at end of file diff --git a/FinNLP/finnlp/data_engineering/data_cleaning.py b/FinNLP/finnlp/data_engineering/data_cleaning.py new file mode 100644 index 0000000000000000000000000000000000000000..1e69446f93f5329ba724f23cf1f3bd552dcbfafe --- /dev/null +++ b/FinNLP/finnlp/data_engineering/data_cleaning.py @@ -0,0 +1,104 @@ +import re +import os +from transformers import BertTokenizer +from datasketch import MinHash, MinHashLSH +from nltk import ngrams + +# junk data +def junk_eliminate(df, re_expression = r'[&#<>{}\[\]\\]', threshold=0.01, min_len=10): + RE_SUSPICIOUS = re.compile(re_expression) + def impurity(text, min_len=min_len): + """returns the share of suspicious characters in a text""" + if text == None or len(text) < min_len: + return 0 + else: + return len(RE_SUSPICIOUS.findall(text))/len(text) + df['impurity'] = df['text'].apply(impurity, min_len=min_len) + total_num_docs = len(df) + impurity_num_docs = len(df[df['impurity'] >= threshold]) + impurity_ratio = impurity_num_docs / total_num_docs + purity_df = df[df['impurity'] < threshold] + return purity_df, impurity_ratio + +# Biased Content +def toxic_eliminate(df, l_kind='en'): + ''' + l_kind = ['en', 'zh'] + ''' + os.system(f"wget https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/blob/master/{l_kind}") + with open(f'./{l_kind}', 'r') as f: + lines = f.readlines() + banned_words = set([line.rstrip('\n') for line in lines]) + df['banned_words_in_text'] = df['text'].apply(lambda text: [word for word in banned_words if word in text.lower().split()]) + df['matches'] = df['banned_words_in_text'].apply(lambda words: len(words) > 0) + total_num_docs = len(df) + biased_num_docs = df['matches'].sum() + biased_content_ratio = biased_num_docs / total_num_docs + non_toxic_df = df[df['matches'] == 0] + return non_toxic_df, biased_content_ratio + +# Too Short Document +def short_eliminate(df, tokenizer = BertTokenizer.from_pretrained('bert-base-uncased'), 
min_len=100): + # Create a new column with the number of tokens for each text + df['text_length'] = df['text'].apply(lambda text: len(tokenizer.tokenize(text))) + total_num_docs = len(df) + too_short_docs = len(df[df['text_length'] <= min_len]) + too_short_doc_ratio = too_short_docs / total_num_docs + not_short_df = df[df['text_length'] > min_len] + return not_short_df, too_short_doc_ratio + +# Contamination +def process_data(df): + minhashes = {} + for idx, text in enumerate(df['text']): + minhash = MinHash(num_perm=128) + for d in ngrams(text, 13): + s = "".join(d).encode('utf-8') + minhash.update(s) + minhashes[idx] = minhash + return minhashes + +def contamination_eliminate(train_dataset, test_dataset): + train_minhashes = process_data(train_dataset) + test_minhashes = process_data(test_dataset) + + + lsh = MinHashLSH(threshold=0.8, num_perm=128) + + for idx, minhash in train_minhashes.items(): + lsh.insert(idx, minhash) + + duplicates_count = 0 + for idx, minhash in test_minhashes.items(): + result = lsh.query(minhash) + if len(result) > 0: + duplicates_count += 1 + contamination_ratio = duplicates_count / len(test_dataset) + return contamination_ratio + +# Duplication +def duplication_eliminate(df): + lsh = MinHashLSH(threshold=0.85, num_perm=128) + for i, text in enumerate(df['text']): + minhash = MinHash(num_perm=128) + for word in text.split(): + minhash.update(word.encode('utf-8')) + lsh.insert(str(i), minhash) + + unique_documents = set() + + for i, text in enumerate(df['text']): + query_minhash = MinHash(num_perm=128) + for word in text.split(): + query_minhash.update(word.encode('utf-8')) + results = lsh.query(query_minhash) + try: + unique_documents.add(results[0]) + except Exception as e: + print(f'error: {e}') + total_unique_documents = len(unique_documents) + total_documents = len(df) + duplication_ratio = (total_documents - total_unique_documents) / total_documents + return unique_documents, duplication_ratio + + diff --git 
a/FinNLP/finnlp/data_sources/__init__.py b/FinNLP/finnlp/data_sources/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/FinNLP/finnlp/data_sources/_base.py b/FinNLP/finnlp/data_sources/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..b780f2963054ec83c8c091f2908697237b0c4329 --- /dev/null +++ b/FinNLP/finnlp/data_sources/_base.py @@ -0,0 +1,80 @@ +from finnlp.utils.get_proxy import get_china_free_proxy, get_us_free_proxy, Kuaidaili +import requests + +class FinNLP_Downloader: + def __init__(self, args = {}): + self.use_proxy = True if "use_proxy" in args.keys() else False + if self.use_proxy: + self.country = args["use_proxy"] + else: + self.country = None + self.max_retry = args["max_retry"] if "max_retry" in args.keys() else 1 + self.proxy_pages = args["proxy_pages"] if "proxy_pages" in args.keys() else 5 + if self.use_proxy: + if "kuaidaili" in self.country: + # tunnel, username, password + assert "tunnel" in args.keys(), "Please make sure \'tunnel\' in your keys" + assert "username" in args.keys(), "Please make sure \'username\' in your keys" + assert "password" in args.keys(), "Please make sure \'password\' in your keys" + self.proxy_list = Kuaidaili(args["tunnel"], args["username"], args["password"]) + else: + self.proxy_id = 0 + self.proxy_list = self._update_proxy() + else: + self.proxy_list = [] + + def _get_proxy(self): + if self.use_proxy: + if "kuaidaili" in self.country: + proxy = self.proxy_list.get_kuaidaili_tunnel_proxy() + return proxy + elif len(self.proxy_list) >0: + proxy = self.proxy_list[self.proxy_id] + self.proxy_id += 1 + if self.proxy_id == len(self.proxy_list): + self.proxy_id = 0 + return proxy + else: + return None + + def _update_proxy(self): + if "china" in self.country or "China" in self.country: + return get_china_free_proxy(self.proxy_pages) + else: + return get_us_free_proxy(self.proxy_pages) + + def _request_get(self, 
url, headers = None, verify = None, params = None): + if headers is None: + headers = { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0" + } + max_retry = self.max_retry + proxies = self._get_proxy() + for _ in range(max_retry): + try: + response = requests.get(url = url, proxies = proxies, headers = headers, verify = verify, params = params) + if response.status_code == 200: + break + except: + response = None + + if response is not None and response.status_code != 200: + response = None + + return response + + def _request_post(self, url, headers, json): + max_retry = self.max_retry + proxies = self._get_proxy() + for _ in range(max_retry): + try: + response = requests.post(url = url, headers = headers, json = json, proxies = proxies) + if response.status_code == 200: + break + except: + response = None + + if response is not None and response.status_code != 200: + response = None + + return response diff --git a/FinNLP/finnlp/data_sources/company_announcement/__init__.py b/FinNLP/finnlp/data_sources/company_announcement/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/FinNLP/finnlp/data_sources/company_announcement/_base.py b/FinNLP/finnlp/data_sources/company_announcement/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..85eb77202f15f26181ea46524d87ce635f4f18ac --- /dev/null +++ b/FinNLP/finnlp/data_sources/company_announcement/_base.py @@ -0,0 +1,22 @@ +from finnlp.data_sources._base import FinNLP_Downloader + +class Company_Announcement_Downloader(FinNLP_Downloader): + + def __init__(self, args = {}): + super().__init__(args) + pass + + def download_date_range_all(self, start_date, end_date): + pass + + def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): + pass + + def download_streaming_all(self, rounds = 3): + pass + + def download_streaming_stock(self, stock = None, rounds 
= 3): + pass + + def clean_data(self): + pass \ No newline at end of file diff --git a/FinNLP/finnlp/data_sources/company_announcement/juchao.py b/FinNLP/finnlp/data_sources/company_announcement/juchao.py new file mode 100644 index 0000000000000000000000000000000000000000..6ab3a86ace934eaeb42cd2bbca708b1193ed301d --- /dev/null +++ b/FinNLP/finnlp/data_sources/company_announcement/juchao.py @@ -0,0 +1,138 @@ +from finnlp.data_sources.company_announcement._base import Company_Announcement_Downloader + +import requests +import time +import json +import os +import pandas as pd +from tqdm import tqdm +from PyPDF2 import PdfReader + +class Juchao_Announcement(Company_Announcement_Downloader): + + def __init__(self, args = {}): + super().__init__(args) + self.dataframe = pd.DataFrame() + + def download_date_range_stock(self,start_date, end_date, stock = "000001",max_page = 100, searchkey= "", get_content = False, save_dir = "./tmp/" , delate_pdf = False): + self.org_dict = self._get_orgid() + + # download the first page + res = self._get_open_page(start_date, end_date, stock, 1, searchkey) + total_pages = res["totalpages"]+1 + + if res["announcements"] is None: + print(f"Nothing related to your searchkey({searchkey}) is found, you may try another one or just leave it blank") + else: + tmp_df = self._process_data(res) + self.dataframe = pd.concat([self.dataframe, tmp_df]) + + page = 2 + # download other page + pbar = tqdm(total=total_pages,desc="Downloading by page...") + + for _ in range(max_page): + res = self._get_open_page(start_date, end_date, stock, page, searchkey) + if res["announcements"] is None: + break + tmp_df = self._process_data(res) + self.dataframe = pd.concat([self.dataframe, tmp_df]) + pbar.update(1) + page += 1 + pbar.update(1) + # Convert Time + self.dataframe.announcementTime = self.dataframe.announcementTime.apply(lambda x:time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(x/1000))) + self.dataframe.announcementTime = 
pd.to_datetime(self.dataframe.announcementTime) + + if get_content: + pbar = tqdm(total=self.dataframe.shape[0], desc="Getting the text data...") + self.dataframe[["PDF_path","Content"]] = self.dataframe.apply(lambda x: self._get_pdfs(x,save_dir, delate_pdf, pbar),axis= 1,result_type = "expand") + if delate_pdf: + os.removedirs(save_dir) + + self.dataframe = self.dataframe.reset_index(drop = True) + + def _get_open_page(self,start_date,end_date, stock,page, searchkey): + url = "http://www.cninfo.com.cn/new/hisAnnouncement/query?" + headers = { + "Referer": "http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36", + } + data = { + "pageNum": page, + "pageSize": "30", + "column": "szse", + "tabName": "fulltext", + "plate":"", + "stock":stock + "," + self.org_dict[stock] , + "searchkey": searchkey, + "secid":"", + "category":"", + "trade":"", + "seDate": f"{start_date}~{end_date}", + "sortName": "", + "sortType": "", + "isHLtitle": "true", + } + res = requests.post(url = url, headers = headers, data = data) + if res.status_code != 200: + raise ConnectionError + + res = json.loads(res.text) + return res + + def _process_data(self,res): + if res is None: + return res + else: + return pd.DataFrame(res["announcements"]) + + def _get_pdfs(self,x, save_dir, delate_pdf,pbar): + os.makedirs(save_dir, exist_ok= True) + adjunctUrl = x.adjunctUrl + pdf_base_url = "http://static.cninfo.com.cn/" + pdf_url = pdf_base_url + adjunctUrl + responsepdf = self._request_get(pdf_url) + + + if responsepdf is None: + pbar.update(1) + return ("Failed Download","Failed Download") + + else: + # make preparations + file_name = x.announcementTitle + file_name = "".join(file_name.split("")) + file_name = "".join(file_name.split("")) + file_name + file_name = f"{x.secCode}_{x.secName}_{file_name}.pdf" + file_path = 
class SEC_Announcement(Company_Announcement_Downloader):
    """Download SEC EDGAR search hits together with the text of each filing.

    Every successfully fetched document becomes one row of ``self.dataframe``.
    """

    _SEARCH_URL = "https://efts.sec.gov/LATEST/search-index"
    _ARCHIVE_URL = "https://www.sec.gov/Archives/edgar/data"
    _HEADERS = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
    }
    # The search is paged in steps of 100 hits (``from`` = (page - 1) * 100).
    _PAGE_SIZE = 100
    # ``_source`` keys copied verbatim into columns of the same name.
    _SOURCE_FIELDS = (
        "ciks", "period_ending", "root_form", "file_num", "display_names", "xsl", "sequence",
        "file_date", "biz_states", "sics", "form", "adsh", "film_num", "biz_locations", "file_type",
        "file_description", "inc_states",
    )
    # ``ite`` holds ``_source["items"]``; ``content`` holds the extracted text.
    _COLUMNS = ("_id",) + _SOURCE_FIELDS + ("ite", "content")

    def __init__(self, args = {}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_date_range_stock(self, start_date, end_date, stock = "AAPL", delay = 0.1):
        """Download every filing of ``stock`` between ``start_date`` and ``end_date``.

        Args:
            start_date: Sent to the search endpoint as ``startdt``.
            end_date: Sent to the search endpoint as ``enddt``.
            stock: Ticker used to look up the EDGAR entity name.
            delay: Seconds to sleep after each downloaded document.
        """
        entityName = self._get_entity_name(stock)
        # The first page also tells us how many pages exist.
        total_pages = self._gather_one_page(start_date, end_date, 1, entityName, delay)
        # A failed first request is reported as the string 'Error'; comparing
        # that with an int would raise TypeError, so only page on real counts.
        if isinstance(total_pages, int) and total_pages > 1:
            for page in tqdm(range(2, total_pages + 1), desc="Downloading other page..."):
                self._gather_one_page(start_date, end_date, page, entityName, delay)

        self.dataframe = self.dataframe.reset_index(drop = True)

    def _get_entity_name(self, stock = "AAPL"):
        """Return the search's first suggestion for ``stock`` as ``"<name> (CIK <id>)"``.

        Raises:
            ConnectionError: If the suggestion request fails.
            IndexError: If the search returns no suggestion for ``stock``.
        """
        resp = self._request_get(
            url = self._SEARCH_URL, headers = self._HEADERS, params = {"keysTyped": stock}
        )
        if resp is None:
            raise ConnectionError("Can't get entity name")

        hits = json.loads(resp.text)["hits"]["hits"]
        if not hits:
            raise IndexError(f"No SEC entity found for {stock!r}")

        best = hits[0]
        # The id is zero-padded to 10 characters inside the entity label.
        return f'{best["_source"]["entity_words"]} (CIK {best["_id"].zfill(10)})'

    def _gather_one_page(self, start_date, end_date, page, entityName = "Apple Inc. (AAPL) (CIK 0000320193)", delay = 0.01):
        """Download one page of search hits and append their text to ``self.dataframe``.

        Args:
            start_date: Sent as ``startdt``.
            end_date: Sent as ``enddt``.
            page: 1-based page number.
            entityName: Entity label as produced by ``_get_entity_name``.
            delay: Seconds to sleep after each successfully processed document.

        Returns:
            The total number of result pages, or the string ``'Error'`` when
            the search request itself failed.
        """
        params = {
            "dateRange": "all",
            "entityName": entityName,
            "startdt": start_date,
            "enddt": end_date,
            "from" : (page - 1) * self._PAGE_SIZE,
            "page" : page,
        }
        resp = self._request_get(url = self._SEARCH_URL, headers = self._HEADERS, params = params)
        if resp is None:
            return 'Error'
        res = json.loads(resp.text)

        total_items = res["hits"]["total"]["value"]
        total_pages = -(-total_items // self._PAGE_SIZE)  # ceiling division

        frames = []
        for item in tqdm(res["hits"]["hits"], desc="Downloading by item..."):
            source = item["_source"]
            # ``_id`` looks like "<accession-with-dashes>:<file name>".
            accession, file_name = item["_id"].split(":")
            accession_parts = accession.split("-")
            # Drop only the leading zero padding; strip("0") would also eat
            # trailing zeros that are part of the number.
            filer_id = accession_parts[0].lstrip("0")
            folder = "".join(accession_parts)

            xsl = source["xsl"]
            if xsl is not None:
                url_new = f"{self._ARCHIVE_URL}/{filer_id}/{folder}/{xsl}/{file_name}"
            else:
                url_new = f"{self._ARCHIVE_URL}/{filer_id}/{folder}/{file_name}"

            respn = self._request_get(url = url_new, headers = self._HEADERS)
            if respn is None:
                continue

            try:
                tree = etree.HTML(respn.text)
                content = "".join(c for c in tree.xpath("/html/body//text()") if c != "\n")
                row = [item["_id"]]
                row.extend(source[field] for field in self._SOURCE_FIELDS)
                row.extend([source["items"], content])
                tmp = pd.DataFrame([dict(zip(self._COLUMNS, row))])
            except Exception:
                # Best effort: skip documents that cannot be parsed. Unlike a
                # bare ``except`` this still lets Ctrl-C interrupt the run.
                continue

            frames.append(tmp)
            time.sleep(delay)

        if frames:
            self.dataframe = pd.concat([self.dataframe, *frames])

        return total_pages
def load_dataset(dataset_name, **kwargs):
    """Load a supported raw dataset and return it as a ``datasets.Dataset``.

    Only ``"Stocknet"`` is supported. Its raw tweets are read from a directory
    tree ``<root>/<stock>/<date file>`` in which every file holds one JSON
    object per line.

    Args:
        dataset_name: Name of the dataset to load.
        **kwargs: Optional ``root_path`` overriding the default location
            ``../../../stocknet-dataset/tweet/raw``.

    Returns:
        A ``datasets.Dataset`` with one row per tweet record.

    Raises:
        NotImplementedError: If ``dataset_name`` is not ``"Stocknet"``.
    """
    if dataset_name != "Stocknet":
        raise NotImplementedError("Only support Stocknet dataset for now")

    root_path = kwargs.get("root_path", r"../../../stocknet-dataset/tweet/raw")
    frames = []
    for stock in tqdm(os.listdir(root_path), desc="Loading Stocknet dataset..."):
        stock_path = os.path.join(root_path, stock)
        for date in os.listdir(stock_path):
            # Open the file for *this* date; indexing date_files[0] here would
            # re-read the first file once per date and skip all the others.
            with open(os.path.join(stock_path, date), encoding="utf-8") as f:
                records = [json.loads(line) for line in f]
            frames.append(pd.DataFrame(records))

    # Concatenate once instead of growing a frame inside the loop.
    combined = pd.concat(frames, axis=0) if frames else pd.DataFrame()
    combined = combined.reset_index(drop=True)
    return datasets.Dataset.from_pandas(combined)
def correct_date(yr, dt):
    """Return ``dt`` with its year forced to ``yr``.

    Some transcripts carry a timestamp whose year disagrees with the
    transcript's own year; the transcript year wins.

    Args:
        yr (int): The year the timestamp should have.
        dt (str): Timestamp formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Returns:
        str: The corrected timestamp in the same format.

    Raises:
        ValueError: If ``dt`` does not match the expected format.
    """
    fmt = "%Y-%m-%d %H:%M:%S"
    parsed = datetime.strptime(dt, fmt)
    if parsed.year != yr:
        try:
            parsed = parsed.replace(year=yr)
        except ValueError:
            # 29 February does not exist in a non-leap target year; use the
            # 28th instead of failing the whole transcript download.
            parsed = parsed.replace(year=yr, day=28)
    return parsed.strftime(fmt)
f"https://discountingcashflows.com/api/transcript/{ticker}/{quarter}/{year}/", + auth=("user", "pass"), + ) + + resp_text = json.loads(response.text) + speakers_list = extract_speakers(resp_text[0]["content"]) + corrected_date = correct_date(resp_text[0]["year"], resp_text[0]["date"]) + resp_text[0]["date"] = corrected_date + return resp_text[0], speakers_list diff --git a/FinNLP/finnlp/data_sources/news/__init__.py b/FinNLP/finnlp/data_sources/news/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/FinNLP/finnlp/data_sources/news/_base.py b/FinNLP/finnlp/data_sources/news/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..28893c5e2b75002465e2b0553f8ec2940a91f4bc --- /dev/null +++ b/FinNLP/finnlp/data_sources/news/_base.py @@ -0,0 +1,22 @@ +from finnlp.data_sources._base import FinNLP_Downloader + +class News_Downloader(FinNLP_Downloader): + + def __init__(self, args = {}): + super().__init__(args) + pass + + def download_date_range(self, start_date, end_date, stock = None): + pass + + def download_streaming(self, stock = None): + pass + + def clean_data(self): + pass + + def _gather_one_part(self, date, stock = None, delay = 0.1): + pass + + def _gather_content(self): + pass diff --git a/FinNLP/finnlp/data_sources/news/akshare_cctv.py b/FinNLP/finnlp/data_sources/news/akshare_cctv.py new file mode 100644 index 0000000000000000000000000000000000000000..69e4e2178a41f70391d6c3559cc1718452dd5bb0 --- /dev/null +++ b/FinNLP/finnlp/data_sources/news/akshare_cctv.py @@ -0,0 +1,29 @@ +import pandas as pd +import akshare as ak +from tqdm.notebook import tqdm +from finnlp.data_sources.news._base import News_Downloader + + +class Akshare_cctv(News_Downloader): + + def __init__(self, args={}): + pass + + def download_news(self, start_date, end_date, stock="all"): + self.date_list = pd.date_range(start_date, end_date) + res = pd.DataFrame() + for date in 
def transfer_standard_date_to_nonstandard(self, date):
    """Render ``date`` as a compact ``YYYYMMDD`` string.

    Args:
        date: Any object with a ``strftime`` method (e.g. a datetime).

    Returns:
        The date formatted with ``"%Y%m%d"``.
    """
    compact_format = "%Y%m%d"
    formatted = date.strftime(compact_format)
    return formatted
class AllianceNews_Streaming(News_Downloader):
    """Collect Alliance News article listings from Interactive Investor.

    Source page: https://www.ii.co.uk/news/source/alliance-news
    Downloaded listings accumulate in ``self.dataframe``.
    """

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword = "appple", rounds = 3, delay = 0.5):
        """Download up to ``rounds`` consecutive pages of article listings.

        Args:
            keyword: Accepted for signature parity with the other streaming
                downloaders; it is not sent to the endpoint.
            rounds: Maximum number of pages to fetch. The first page is always
                requested, even when ``rounds`` is less than 1.
            delay: Seconds to sleep before each follow-up page request.

        Returns:
            A ``"Connection Error: <status>"`` string when the first request
            fails, otherwise ``None``.
        """
        url = "https://api-prod.ii.co.uk/api/1/content/articles"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
            'Referer': 'https://www.ii.co.uk/news/source/alliance-news',
            'Ii-Consumer-Type': 'web.public'
        }
        params = {
            'pageSize': '12',
            'source': 'ALLIANCE',
        }

        for page_index in range(max(rounds, 1)):
            if page_index:
                time.sleep(delay)

            res = requests.get(url = url, headers = headers, params = params)
            if res.status_code != 200:
                if page_index == 0:
                    print(f"Connection Error: {res.status_code}")
                    return f"Connection Error: {res.status_code}"
                break

            payload = json.loads(res.text)
            # Store the page before looking at the cursor, so the final page
            # (the one without a "nextId") is kept rather than dropped.
            self.dataframe = pd.concat([self.dataframe, pd.DataFrame(payload["results"])])

            next_id = payload.get("nextId")
            if next_id is None:
                break
            params["nextId"] = next_id
class Eastmoney_Streaming(News_Downloader):
    """Scrape post listings of one stock from the Eastmoney guba forum.

    Each scraped post becomes one row of ``self.dataframe``.
    """

    _COLUMNS = ["read amount", "comments", "title", "content link", "author", "create time"]

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock = "600519", rounds = 3):
        """Download listing pages for ``stock``.

        Args:
            stock: Stock code used in the listing URL.
            rounds: Number of pages to fetch (pages ``0 .. rounds - 1``). A
                value of 0 or less means "keep going from page 1 until a page
                has no rows".
        """
        print( "Geting pages: ", end = "")
        fetched = 0
        if rounds > 0:
            for page in range(rounds):
                status = self._gather_pages(stock, page)
                if status == "break":
                    break
                if status != "Error":
                    fetched += 1
        else:
            page = 1
            error_count = 0
            while True:
                status = self._gather_pages(stock, page)
                if status == "break":
                    break
                if status == "Error":
                    error_count += 1
                    if error_count > 10:
                        # Give up instead of requesting pages forever.
                        print("Connection Error")
                        break
                else:
                    fetched += 1
                page += 1
        print( f"Get total {fetched} pages.")
        self.dataframe = self.dataframe.reset_index(drop = True)

    def _gather_pages(self, stock, page):
        """Scrape one listing page and append its rows to ``self.dataframe``.

        Returns:
            ``"Error"`` when the request failed, ``"break"`` when the page has
            no rows, otherwise ``None``.
        """
        print( page, end = " ")
        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        }

        response = self._request_get(url, headers=headers)
        # Treat a missing response like any other failed request.
        if response is None or response.status_code != 200:
            return "Error"

        def first(nodes):
            # A missing cell yields an empty string instead of an IndexError.
            return nodes[0] if nodes else ""

        tree = etree.HTML(response.text)
        rows = [
            [
                first(tr.xpath("./td[1]//text()")),
                first(tr.xpath("./td[2]//text()")),
                first(tr.xpath("./td[3]/div/a//text()")),
                first(tr.xpath("./td[3]/div/a/@href")),
                first(tr.xpath("./td[4]//text()")),
                first(tr.xpath("./td[5]//text()")),
            ]
            for tr in tree.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')
        ]
        if not rows:
            return "break"

        page_frame = pd.DataFrame(rows, columns=self._COLUMNS)
        self.dataframe = pd.concat([self.dataframe, page_frame])
class Finnhub_Date_Range(News_Downloader):
    """Download company news from Finnhub and, optionally, the article bodies.

    Headlines are stored in ``self.dataframe``; ``gather_content`` adds a
    ``content`` column by scraping each article URL with a per-source XPath.
    """

    # Sources whose article body sits directly on the fetched page.
    _DIRECT_XPATHS = {
        "Yahoo": "/html/body/div[3]/div[1]/div/main/div[1]/div/div/div/div/article/div/div/div/div/div/div[2]/div[4]",
        "Reuters": "/html/body/div[1]/div[3]/div/main/article/div[1]/div[2]/div/div/div[2]",
        "SeekingAlpha": "/html/body/div[2]/div/div[1]/main/div/div[2]/div/article/div/div/div[2]/div/section[1]/div/div/div",
        "PennyStocks": "/html/body/div[3]/div/div[1]/div/div/div/main/article/div[2]/div[2]/div",
    }
    # On Finnhub landing pages this link points to the publisher's article.
    _REDIRECT_LINK_XPATH = '/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href'

    def __init__(self, args = {}):
        super().__init__(args)
        assert "token" in args.keys(), "Please input your finnhub token. Avaliable at https://finnhub.io/dashboard"
        self.finnhub_client = finnhub.Client(api_key=args["token"])

    def download_date_range_stock(self, start_date, end_date, stock = "AAPL"):
        """Download headlines for ``stock`` between ``start_date`` and ``end_date``.

        The range is queried in windows of four days, one request per window.
        """
        self.date_list = pd.date_range(start_date, end_date)
        self.dataframe = pd.DataFrame()

        days_each_time = 4
        windows = [
            self.date_list[i:i + days_each_time]
            for i in range(0, len(self.date_list), days_each_time)
        ]

        frames = []
        with tqdm(total=len(windows), desc= "Downloading Titles") as bar:
            for window in windows:
                frames.append(self._gather_one_part(
                    window[0].strftime("%Y-%m-%d"),
                    window[-1].strftime("%Y-%m-%d"),
                    stock = stock,
                ))
                bar.update(1)

        if frames:
            self.dataframe = pd.concat(frames)
        # When no news came back there is no "datetime" column to convert.
        if "datetime" in self.dataframe.columns:
            self.dataframe["datetime"] = pd.to_datetime(self.dataframe["datetime"], unit = "s")
        self.dataframe = self.dataframe.reset_index(drop = True)

    def _gather_one_part(self, start_date, end_date, stock = "AAPL", delay = 1):
        """Request one date window from Finnhub and return it as a DataFrame."""
        res = self.finnhub_client.company_news(stock, _from=start_date, to=end_date)
        time.sleep(delay)
        return pd.DataFrame(res)

    def gather_content(self, delay = 0.01):
        """Fill ``self.dataframe["content"]`` by scraping every row's ``url``."""
        with tqdm(total = self.dataframe.shape[0], desc= "Gathering news contents") as pbar:
            self.dataframe["content"] = self.dataframe.apply(
                lambda x: self._gather_content_apply(x, pbar, delay), axis = 1
            )

    def _gather_content_apply(self, x, pbar, delay = 0.01):
        """Return the article text for one row.

        Returns:
            The extracted text, ``"Connection Error"`` when a request failed,
            ``"Not supported yet"`` for unknown sources, or ``"Error"`` when
            the page did not have the expected layout.
        """
        time.sleep(delay)
        response = self._request_get(url = x.url)
        pbar.update(1)
        if response is None:
            return "Connection Error"

        page = etree.HTML(response.text)
        try:
            return self._extract_content(x.source, page)
        except Exception:
            # Layout changes surface as IndexError/KeyError/etc.; report them
            # per row without swallowing KeyboardInterrupt.
            return "Error"

    def _follow_link(self, page, **request_kwargs):
        """Open the publisher link on a landing page; return parsed HTML or None."""
        url_new = page.xpath(self._REDIRECT_LINK_XPATH)[0]
        response = self._request_get(url = url_new, **request_kwargs)
        if response is None:
            return None
        return etree.HTML(response.text)

    def _extract_content(self, source, page):
        """Pull the article text out of ``page`` using the rules for ``source``."""
        if source in self._DIRECT_XPATHS:
            node = page.xpath(self._DIRECT_XPATHS[source])[0]
            return "\n".join(node.xpath(".//text()"))

        if source == "MarketWatch":
            node = page.xpath('//*[@id="js-article__body"]')[0]
            content = "".join(node.xpath(".//text()"))
            # Collapse runs of spaces, then fold line breaks into spaces.
            while "  " in content:
                content = content.replace("  ", " ")
            while "\n \n" in content:
                content = content.replace("\n \n", " ")
            while "\n " in content:
                content = content.replace("\n ", " ")
            return content

        if source == "Seeking Alpha":
            page = self._follow_link(page)
            if page is None:
                return "Connection Error"
            # NOTE(review): this reads the text of the root's first child,
            # exactly as before; confirm whether the article body was intended.
            return "\n".join(page[0].xpath(".//text()"))

        if source == "Alliance News":
            node = page.xpath('//*[@id="comtext"]')[0]
            lines = [c for c in node.xpath(".//text()") if not str(c).startswith("\r\n")]
            return "\n".join(lines)

        if source == "Thefly.com":
            page = self._follow_link(page, verify = False)
            if page is None:
                return "Connection Error"
            texts = page.xpath('/html/body/div[2]/div/div/div/div/div[2]/div[2]//text()')
            return "\n".join(texts).replace("\r\n", "")

        if source == "CNBC":
            texts = page.xpath('/html/body/div[3]/div/div[1]/div[3]/div/div/div/div[3]/div[1]/div[2]/div[3]//text()')
            return "\n".join(texts)

        if source == "GuruFocus":
            page = self._follow_link(page)
            if page is None:
                return "Connection Error"
            texts = page.xpath('/html/body/div[1]/div/section/section/main/section/main/div[1]/div/div/div[1]/div[2]/div//text()')
            cleaned = []
            for c in texts:
                c = c.replace("\n", "")
                # Remove double spaces repeatedly until none remain.
                while "  " in c:
                    c = c.replace("  ", "")
                cleaned.append(c)
            return "\n".join(cleaned)

        if source == "InvestorPlace":
            page = self._follow_link(page)
            if page is None:
                return "Connection Error"
            # The second JSON-LD script carries the article metadata.
            script = page.xpath('//script[@type="application/ld+json"]')[1]
            return json.loads(script.xpath(".//text()")[0])["articleBody"]

        if source == "TipRanks":
            page = self._follow_link(page)
            if page is None:
                return "Connection Error"
            texts = page.xpath('/html/body/div[1]/div[1]/div[4]/div[2]/div[2]/div[1]/div[6]//text()')
            return "".join(p.replace("\n", "") for p in texts)

        # Includes TalkMarkets and every source without an extraction rule.
        return "Not supported yet"
class GuruFocus_Streaming(News_Downloader):
    """Scrape the article list of one ticker from GuruFocus.

    Only the first listing page is read; rows accumulate in ``self.dataframe``
    with the columns ``title``, ``view``, ``source`` and ``datetime``.
    """

    _COLUMNS = ["title", "view", "source", "datetime"]
    # Separator between the three fields of a listing's summary line.
    _SUMMARY_SEP = ' \xa0\xa0 '

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword = "AAPL", rounds = 3, delay = 0.5):
        """Download the first page of articles for ``keyword``.

        Args:
            keyword: Ticker inserted into the listing URL.
            rounds: Unused; only the first page is supported.
            delay: Unused; only one request is made.

        Returns:
            A ``"Connection Error: <status>"`` string when the request fails,
            otherwise ``None``.
        """
        url = f"https://www.gurufocus.com/stock/{keyword}/article"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        }
        res = requests.get(url = url, headers= headers)
        if res.status_code != 200:
            print(f"Connection Error: {res.status_code}")
            return f"Connection Error: {res.status_code}"

        tree = etree.HTML(res.text)
        # The first div of the container is skipped, as it is not a listing.
        entries = tree.xpath("/html/body/div[1]/div/section/section/main/div[1]/div[4]/div[1]/div/div")[1:]

        rows = []
        for entry in entries:
            title = " ".join(entry.xpath("./div[1]/h4/a//text()"))
            title = title.replace("\n", '').strip(" ")

            summary = " ".join(entry.xpath("div[5]/text()")).replace('\n','').strip(' ')
            parts = summary.split(self._SUMMARY_SEP)
            if len(parts) != 3:
                # A summary that does not have exactly three fields must not
                # abort the whole download; keep the title, blank the rest.
                parts = ["", "", ""]
            rows.append([title, *parts])

        tmp = pd.DataFrame(rows, columns=self._COLUMNS)
        self.dataframe = pd.concat([self.dataframe, tmp])

        print("Only support first page now!")
class InvestorPlace_Streaming(News_Downloader):
    """Scrape InvestorPlace search results for a keyword.

    Rows accumulate in ``self.dataframe`` with the columns ``title``,
    ``time``, ``author`` and ``summary``.
    """

    _COLUMNS = ['title', 'time', 'author', 'summary']

    def __init__(self, args={}):
        super().__init__(args)
        self.dataframe = pd.DataFrame()

    @staticmethod
    def _first(node, path):
        """Return the first XPath hit under ``node``, or ``''`` when there is none."""
        hits = node.xpath(path)
        return hits[0] if hits else ''

    @staticmethod
    def _squash(text):
        """Remove newlines and tabs from ``text``."""
        return text.replace('\n', '').replace('\t', '')

    def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5):
        """Download ``rounds`` pages of search results for ``keyword``.

        Args:
            keyword: Search term sent as ``q``.
            rounds: Number of result pages to request (``pg`` = 0 .. rounds - 1).
            delay: Seconds to sleep after each page.
        """
        url = 'https://investorplace.com/search/'

        headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
        }
        print("Downloading ...", end = ' ')
        for page in range(rounds):
            params = {
                'q': keyword,
                "pg": page,
            }
            res = requests.get(url = url, params=params, headers=headers)
            if res.status_code != 200:
                break

            tree = etree.HTML(res.text)
            groups = tree.xpath("/html/body/main/section/div/div/div/div[2]/div[1]/div[1]/div")
            entries = [entry for group in groups for entry in group.xpath("./div")]

            # One row per entry keeps the four columns aligned; the previous
            # extra title append left the title column longer than the rest.
            rows = [
                [
                    self._first(entry, './h2/a//text()'),
                    self._squash(self._first(entry, 'div/time//text()')),
                    self._squash(self._first(entry, 'div/span/a/text()')),
                    self._squash(self._first(entry, 'p/text()')),
                ]
                for entry in entries
            ]

            tmp = pd.DataFrame(rows, columns=self._COLUMNS)
            self.dataframe = pd.concat([self.dataframe, tmp])

            print(page, end = ' ')

            time.sleep(delay)
authors = [] + for div in divs: + # title + title = div.xpath("./div/h3/a/text()") + # time + time_ = div.xpath("./div/div/span[1]/text()") + # author + author = div.xpath("./div/div/span[2]/text()") + + if len(title)>0: + titles.append(' '.join(title).replace("\n","").strip(" ")) + times.append(' '.join(time_)) + authors.append(' '.join(author)) + + # concat results + tmp = pd.DataFrame([titles, times, authors]).T + tmp.columns = ["title", "time", "author"] + self.dataframe = pd.concat([self.dataframe, tmp]) + + # sleep + time.sleep(delay) + \ No newline at end of file diff --git a/FinNLP/finnlp/data_sources/news/marketwatch_streaming.py b/FinNLP/finnlp/data_sources/news/marketwatch_streaming.py new file mode 100644 index 0000000000000000000000000000000000000000..677dc895153fab514684a74698a759a281a6a55e --- /dev/null +++ b/FinNLP/finnlp/data_sources/news/marketwatch_streaming.py @@ -0,0 +1,140 @@ +import requests +from lxml import etree +from tqdm import tqdm +import pandas as pd +import json +import time +from finnlp.data_sources.news._base import News_Downloader + +# TODO: +# 1. More pages +# 2. 
def _marketwatch_search_first_page(keyword, start_date=None, end_date=None):
    """Fetch the first MarketWatch search page and parse it.

    Shared by both MarketWatch downloaders in this module, which previously
    carried two identical copies of this logic.

    Args:
        keyword: Search term.
        start_date: Optional ``sd`` query value; ``None`` is sent as-is.
        end_date: Optional ``ed`` query value; ``None`` is sent as-is.

    Returns:
        ``(frame, error)``: on success ``frame`` is a DataFrame with the
        columns ``title``, ``time``, ``author`` and ``error`` is ``None``;
        on a non-200 response ``frame`` is ``None`` and ``error`` is the
        ``'Connection Error: <status>'`` message.
    """
    url = "https://www.marketwatch.com/search"
    params = {
        'q': keyword,
        'ts': '0',
        'tab': 'All News',
        'sd': start_date,
        'ed': end_date,
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }

    res = requests.get(url=url, headers=headers, params=params)
    if res.status_code != 200:
        return None, f'Connection Error: {res.status_code}'

    tree = etree.HTML(res.text)
    divs = tree.xpath("body/main/div/div[2]/div[2]/div[2]/div[2]/mw-tabs/div[2]/div[1]/div/div[1]/div")
    rows = []
    for div in divs:
        title = div.xpath("./div/h3/a/text()")
        # Containers without a headline are not news items; skip them.
        if len(title) > 0:
            rows.append([
                ' '.join(title).replace("\n", "").strip(" "),
                ' '.join(div.xpath("./div/div/span[1]/text()")),
                ' '.join(div.xpath("./div/div/span[2]/text()")),
            ])

    return pd.DataFrame(rows, columns=["title", "time", "author"]), None


class MarketWatch_Streaming(News_Downloader):
    """Collect the newest MarketWatch search results for a keyword.

    Only the first result page is supported; ``rounds`` is accepted for
    interface compatibility and currently unused.
    """

    def __init__(self, args=None):
        """Create the downloader with an empty result frame.

        Args:
            args: Optional settings dict forwarded to ``News_Downloader``.
        """
        # ``None`` instead of ``{}`` so instances never share one default dict.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="apple", rounds=3, delay=0.5):
        """Download the first result page for *keyword*."""
        self._download_first_page(keyword, delay=delay)

        # Paging is not implemented yet.
        print("Only support the first page now!")

    def download_date_range_search(self, start_date, end_date, keyword="apple", rounds=1000, delay=0.5):
        """Download the first result page for *keyword* within a date range."""
        self._download_first_page(keyword, delay=delay, start_date=start_date, end_date=end_date)

        # Paging is not implemented yet.
        print("Only support the first page now!")

    def _download_first_page(self, keyword="apple", delay=0.5, start_date=None, end_date=None):
        """Append the first result page to ``self.dataframe``.

        Returns:
            ``None`` on success, or the ``'Connection Error: <status>'``
            string when the request did not return HTTP 200.
        """
        tmp, error = _marketwatch_search_first_page(keyword, start_date, end_date)
        if error is not None:
            print(error)
            return error

        self.dataframe = pd.concat([self.dataframe, tmp])
        time.sleep(delay)


class MarketWatch_Date_Range(News_Downloader):
    """Collect MarketWatch search results for a date range (first page only)."""

    def __init__(self, args=None):
        """Create the downloader with an empty result frame.

        Args:
            args: Optional settings dict forwarded to ``News_Downloader``.
        """
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_date_range_search(self, start_date, end_date, keyword="apple", delay=0.5):
        """Download the first result page for *keyword* within a date range."""
        self._download_first_page(keyword, delay=delay, start_date=start_date, end_date=end_date)

        # Paging is not implemented yet.
        print("Only support the first page now!")

    def _download_first_page(self, keyword="apple", delay=0.5, start_date=None, end_date=None):
        """Append the first result page to ``self.dataframe``.

        Returns:
            ``None`` on success, or the ``'Connection Error: <status>'``
            string when the request did not return HTTP 200.
        """
        tmp, error = _marketwatch_search_first_page(keyword, start_date, end_date)
        if error is not None:
            print(error)
            return error

        self.dataframe = pd.concat([self.dataframe, tmp])
        time.sleep(delay)
class PennyStocks_Streaming(News_Downloader):
    """Collect pennystocks.com search results into ``self.dataframe``.

    Only the first result page is supported. Rows have the columns
    ``title``, ``time``, ``brief`` and ``reading_time``.
    """

    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    }
    _ARTICLE_XPATH = "/html/body/div[3]/div/div[1]/div/div/div/main/div/div[1]/div/article"
    _COLUMNS = ["title", "time", "brief", "reading_time"]

    def __init__(self, args=None):
        """Create the downloader with an empty result frame.

        Args:
            args: Optional settings dict forwarded to ``News_Downloader``.
        """
        # ``None`` instead of ``{}`` so instances never share one default dict.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="apple", rounds=3, delay=2):
        """Open a session and download the first result page for *keyword*.

        Args:
            keyword: Search term.
            rounds: Accepted for interface compatibility; currently unused.
            delay: Seconds to wait between retries of the search request.

        Raises:
            ConnectionError: If the site's front page cannot be requested.
        """
        self._connect_session()
        self._download_first_page(keyword, delay=delay)

        # Paging is not implemented yet.
        print("Only support the first page now!")

    def _connect_session(self):
        """Request the front page once so the session holds the site cookies.

        The server checks cookies, so the search request only works after the
        main site has been visited.

        Raises:
            ConnectionError: If the front page does not return HTTP 200.
        """
        self.session = requests.session()
        first_url = "https://pennystocks.com/"
        print("Requesting https://pennystocks.com ...", end=" ")
        res = self.session.get(headers=self._HEADERS, url=first_url)
        if res.status_code != 200:
            raise ConnectionError("Can't request https://pennystocks.com. Please check your connection or report this issue on Github")

        print("succeed!")

    @staticmethod
    def _first_text(node, path):
        """Return the first XPath hit of *path* under *node*, or ``''``."""
        hits = node.xpath(path)
        return hits[0] if hits else ''

    def _download_first_page(self, keyword="apple", max_retry=5, delay=2):
        """Fetch the search page, retrying while it contains no articles.

        Args:
            keyword: Search term, sent as the ``s`` query parameter so that
                requests URL-encodes it.
            max_retry: Maximum number of additional attempts.
            delay: Seconds to sleep before each retry.
        """
        url = "https://pennystocks.com/"
        params = {"s": keyword}

        res = self.session.get(url=url, params=params, headers=self._HEADERS)
        articles = etree.HTML(res.text).xpath(self._ARTICLE_XPATH)

        # The first answer is often empty; repeating the request with the
        # session cookies eventually returns the article list.
        while max_retry and len(articles) == 0:
            time.sleep(delay)
            print("Gathering again ..", end=' ')
            res = requests.get(url=url, params=params, headers=self._HEADERS, cookies=self.session.cookies)
            articles = etree.HTML(res.text).xpath(self._ARTICLE_XPATH)
            max_retry -= 1
            print(f"Remaining Retry: {max_retry}")

        # A missing field becomes '' instead of aborting the whole page.
        rows = [
            [
                self._first_text(a, "./header/h2/a//text()"),
                self._first_text(a, "./div[3]/div/div/ul/li[1]/text()"),
                self._first_text(a, "./div[3]/div/div/text()"),
                self._first_text(a, "./div[3]/div/div/ul/li[2]/text()"),
            ]
            for a in articles
        ]
        tmp = pd.DataFrame(rows, columns=self._COLUMNS)
        self.dataframe = pd.concat([self.dataframe, tmp])

    def _download_other_pages(self, keyword="apple"):
        """Placeholder for paging support; not implemented."""
        pass
class Reuters_Streaming(News_Downloader):
    """Collect Reuters site-search results into ``self.dataframe``."""

    def __init__(self, args=None):
        """Create the downloader with an empty result frame.

        Args:
            args: Optional settings dict forwarded to ``News_Downloader``.
        """
        # ``None`` instead of ``{}`` so instances never share one default dict.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="apple", rounds=3, delay=0.5):
        """Download up to *rounds* pages of search results for *keyword*.

        Args:
            keyword: Search term.
            rounds: Maximum number of pages to request.
            delay: Seconds to sleep after each page.

        Returns:
            ``"Error"`` if a request failed; otherwise ``None``. Pages
            downloaded before the failure stay in ``self.dataframe``.
        """
        news_per_page = 20
        url = "https://www.reuters.com/pf/api/v3/content/fetch/articles-by-search-v2"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            "Referer": "https://www.reuters.com/site-search/?query=AAPL&sort=newest&offset=0"
        }

        print("Geting pages: ", end="")
        for i in range(rounds):
            offset = i * news_per_page
            # json.dumps escapes quotes/backslashes in the keyword; compact
            # separators keep the payload identical to the previous
            # hand-written string for plain keywords.
            query = json.dumps(
                {
                    "keyword": keyword,
                    "offset": offset,
                    "orderby": "display_date:desc",
                    "size": news_per_page,
                    "website": "reuters",
                },
                separators=(",", ":"),
            )
            params = {
                "query": query,
                "d": "144",
                "_website": "reuters",
            }
            response = self._request_get(url, headers=headers, params=params)

            # Other downloaders in this package test the result of
            # ``_request_get`` for None, so treat that as a failure as well.
            if response is None or response.status_code != 200:
                return "Error"

            payload = json.loads(response.text)

            # The API reports its own status inside the body.
            if payload["statusCode"] != 200:
                print("Early Stopping")
                break

            tmp = pd.DataFrame(payload["result"]["articles"])
            self.dataframe = pd.concat([self.dataframe, tmp])

            print(i + 1, end=" ")
            time.sleep(delay)


class SeekingAlpha_Date_Range(News_Downloader):
    """Collect Seeking Alpha news for one ticker within a date range."""

    # Column names assigned, in order, to the values of each record's
    # ``attributes`` and ``relationships`` dicts.
    # NOTE(review): this relies on the API returning the keys in exactly this
    # order and number; confirm against a live response.
    _ATTRIBUTE_COLUMNS = ["publishOn", "isLockedPro", "commentCount", "gettyImageUrl", "videoPreviewUrl", "themes", "title", "isPaywalled"]
    _RELATIONSHIP_COLUMNS = ["author", "sentiments", "primaryTickers", "secondaryTickers", "otherTags"]

    def __init__(self, args=None):
        """Forward *args* (an optional settings dict) to ``News_Downloader``."""
        super().__init__({} if args is None else args)

    def download_date_range_stock(self, start_date, end_date, stock="AAPL", proxies=None):
        """Download all news pages for *stock* between the two dates.

        Args:
            start_date: ``YYYY-MM-DD`` string; 13:00 of that day is used.
            end_date: ``YYYY-MM-DD`` string; 13:00 of that day is used.
            stock: Ticker symbol.
            proxies: Optional ``requests`` proxies mapping.
        """
        start_timestamp = int(datetime.strptime(start_date + '-13', "%Y-%m-%d-%H").timestamp())
        end_timestamp = int(datetime.strptime(end_date + '-13', "%Y-%m-%d-%H").timestamp())

        # The first page also tells how many pages exist.
        data, totalpages = self._gather_by_page(start_timestamp, end_timestamp, stock, 1, proxies)
        frames = [data]

        with tqdm(total=totalpages, desc="Downloading Titles") as bar:
            bar.update(1)
            for page in range(2, totalpages + 1):
                data, _ = self._gather_by_page(start_timestamp, end_timestamp, stock, page, proxies)
                frames.append(data)
                bar.update(1)

        # One concat at the end instead of one per page.
        self.dataframe = pd.concat(frames).reset_index(drop=True)

    def _gather_by_page(self, start_timestamp, end_timestamp, stock, page=1, proxies=None):
        """Download one page of the news listing.

        Returns:
            ``(data, totalpages)``. On a non-200 response an empty frame and
            ``1`` are returned so the caller's loop ends.
        """
        url = f"https://seekingalpha.com/api/v3/symbols/{stock}/news?filter[since]={start_timestamp}&filter[until]={end_timestamp}&id={stock}&include=author%2CprimaryTickers%2CsecondaryTickers%2Csentiments&isMounting=true&page[size]=40&page[number]={page}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0',
            'Referer': 'https://seekingalpha.com/symbol/aapl/news?from=2009-12-31T16%3A00%3A00.000Z&to=2022-01-01T15%3A59%3A59.999Z'
        }
        response = requests.get(url, headers=headers, proxies=proxies)
        if response.status_code != 200:
            print(f"stock: {stock}, page: {page} went wrong!")
            return pd.DataFrame(), 1

        res = json.loads(response.text)
        data = pd.DataFrame(res["data"])
        totalpages = res["meta"]["page"]["totalPages"]

        # An empty page has no 'attributes'/'relationships' columns to expand.
        if data.empty:
            return data, totalpages

        data[self._ATTRIBUTE_COLUMNS] = data.apply(lambda x: list(x.attributes.values()), axis=1, result_type="expand")
        data[self._RELATIONSHIP_COLUMNS] = data.apply(lambda x: list(x.relationships.values()), axis=1, result_type="expand")
        return data, totalpages

    def obtain_content(self, parallel=False, proxies=None):
        """Fill ``self.dataframe['content']`` with the article texts.

        Args:
            parallel: Use ``pandarallel`` with one worker per CPU when true.
            proxies: Optional ``requests`` proxies mapping.
        """
        if parallel:
            import os
            from pandarallel import pandarallel
            pandarallel.initialize(nb_workers=os.cpu_count())
            self.dataframe['content'] = self.dataframe.parallel_apply(lambda x: self._obtain_content(x, proxies=proxies), axis=1)
        else:
            self.dataframe['content'] = self.dataframe.apply(lambda x: self._obtain_content(x, proxies=proxies), axis=1)

    def _obtain_content(self, x, proxies=None):
        """Download one article and return its plain text.

        Args:
            x: Row whose ``links['self']`` holds the article path.
            proxies: Optional ``requests`` proxies mapping.

        Returns:
            The article text, or ``''`` when the request fails or the page
            does not have the expected embedded data.
        """
        url = x['links']['self']
        url = f"https://seekingalpha.com{url}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0'
        }
        res = requests.get(url, headers=headers, proxies=proxies)
        if res.status_code != 200:
            return ''

        try:
            # The article is embedded as JSON assigned to window.SSR_DATA in
            # the fifth <script>; drop the trailing character after the JSON.
            script = etree.HTML(res.text).xpath('//script[5]//text()')
            raw = script[0].split('window.SSR_DATA = ')[1][:-1]
            content = json.loads(raw)['article']['response']['data']['attributes']['content']
        except (IndexError, KeyError, TypeError, ValueError):
            # Same fallback as a failed request, so a single unexpected page
            # does not abort the whole ``apply``.
            return ''

        pieces = etree.HTML(content).xpath('//text()')
        # NOTE(review): SOURCE is whitespace-collapsed; the literal compared
        # here is reproduced as it appears (a single space).
        pieces = [c if c != ' ' else '\n' for c in pieces]
        return ''.join(pieces).strip()
class Sina_Finance_Date_Range(News_Downloader):
    """Collect Sina Finance rolling news day by day."""

    def __init__(self, args=None):
        """Create the downloader with an empty result frame.

        Args:
            args: Optional settings dict forwarded to ``News_Downloader``.
        """
        # ``None`` instead of ``{}`` so instances never share one default dict.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_date_range_all(self, start_date, end_date):
        """Download the headlines of every day from *start_date* to *end_date*."""
        self.date_list = pd.date_range(start_date, end_date)
        frames = [self.dataframe]
        for date in tqdm(self.date_list, desc="Downloading Titles..."):
            frames.append(self._gather_one_day(date))
        # One concat at the end instead of one per day.
        self.dataframe = pd.concat(frames).reset_index(drop=True)

    def _gather_one_day(self, date, delay=0.1):
        """Download all listing pages for the 24 hours ending at 16:00 of *date*.

        Args:
            date: Day to download (anything ``pd.Timestamp`` accepts).
            delay: Seconds to sleep after each page.

        Returns:
            DataFrame with one row per news item; ``ctime``, ``mtime`` and
            ``intime`` are converted to Asia/Shanghai timestamps.
        """
        # Build the boundary explicitly: formatting a Timestamp into
        # "<date> 16:00:00" yields a string with two time-of-day parts, and
        # the result then depends on how the parser resolves that.
        day_end = pd.Timestamp(date).normalize() + pd.Timedelta(hours=16)
        end_timestamp = day_end.timestamp()
        start_timestamp = end_timestamp - 60 * 60 * 24

        frames = []
        for page in range(100):
            # NOTE(review): 'etime' carries the earlier and 'stime' the later
            # bound, exactly as before; confirm against the API.
            url = f"https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=2516&etime={start_timestamp}&stime={end_timestamp}&ctime={end_timestamp}&date={date}&k=&num=50&page={page}"
            response = self._request_get(url=url)
            if response is not None:
                # 'unicode' is not a codec name; requests silently fell back
                # to UTF-8 decoding, which is now requested explicitly.
                response.encoding = 'utf-8'
                records = json.loads(response.text, strict=True)["result"]["data"]
                if len(records) == 0:
                    break

                # Each value is wrapped in a list so list-valued fields stay
                # in a single cell of a one-row frame.
                frames.extend(
                    pd.DataFrame({key: [value] for key, value in record.items()})
                    for record in records
                )
            time.sleep(delay)

        if not frames:
            return pd.DataFrame()

        res = pd.concat(frames)
        tz = pytz.timezone("Asia/Shanghai")
        for column in ("ctime", "mtime", "intime"):
            res[column] = pd.to_datetime(res[column], unit="s", utc=True).dt.tz_convert(tz)

        return res

    def gather_content(self, delay=0.01):
        """Fill ``self.dataframe['content']`` with the article texts."""
        pbar = tqdm(total=self.dataframe.shape[0], desc="Gathering news contents")
        self.dataframe["content"] = self.dataframe.apply(lambda x: self._gather_content_apply(x, pbar, delay), axis=1)

    def _gather_content_apply(self, x, pbar, delay=0.01):
        """Download one article (``x.url``) and return its text or NaN."""
        response = self._request_get(url=x.url)

        if response is not None:
            response.encoding = 'utf-8'  # see _gather_one_day
            paragraphs = etree.HTML(response.text).xpath("//*[@id='artibody']/p")
            content = "\n".join(''.join(p.xpath(".//text()")) for p in paragraphs)
            # Drop ideographic spaces.
            content = content.replace("\u3000", "")
        else:
            content = np.nan

        pbar.update(1)
        time.sleep(delay)

        return content


class TalkMarkets_Streaming(News_Downloader):
    """Collect TalkMarkets results through the site's Google custom search."""

    _HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }

    def __init__(self, args=None):
        """Create the downloader with an empty result frame.

        Args:
            args: Optional settings dict forwarded to ``News_Downloader``.
        """
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="apple", rounds=3, delay=0.5):
        """Download up to *rounds* pages (20 hits each) for *keyword*.

        The default keyword was misspelled ("appple") while the query itself
        was hard-coded to 'apple'; the keyword is now actually searched and
        the default corrected, so a call without arguments behaves as before.
        The platform limits searches to about 10 rounds.

        Args:
            keyword: Search term.
            rounds: Number of result pages to request.
            delay: Seconds to sleep after each page.
        """
        # 1. search-engine id, 2. token for that engine
        cx = self._obtain_cx(keyword)
        ces_token = self._obtain_cse_token(cx)
        # Both helpers report failure as a "Connection Error: ..." string;
        # searching with such a value cannot succeed.
        if any(str(v).startswith("Connection Error") for v in (cx, ces_token)):
            return

        # 3. results
        print("Downloading...", end=' ')
        url = "https://cse.google.com/cse/element/v1"
        for i in range(rounds):
            params = {
                'rsz': 'filtered_cse',
                'num': '20',
                'hl': 'en',
                'source': 'gcsc',
                'gss': '.com',
                'start': i * 20,
                'cselibv': '827890a761694e44',
                'cx': cx,
                'q': keyword,
                'safe': 'off',
                'cse_tok': ces_token,
                'sort': 'date',
                'exp': 'csqr,cc',
                'callback': 'google.search.cse.api1861',
            }
            res = requests.get(url=url, headers=self._HEADERS, params=params)
            if res.status_code != 200:
                break

            # SECURITY: this used to eval() the remote response. The body is
            # a JSONP wrapper "callback({...});", so parse the text between
            # the outer parentheses as JSON instead of executing it.
            text = res.text
            payload = json.loads(text[text.index("(") + 1:text.rindex(")")])
            # A page without hits may omit "results"; treat it as empty.
            tmp = pd.DataFrame(payload.get("results", []))
            self.dataframe = pd.concat([self.dataframe, tmp])

            time.sleep(delay)
            print(i, end=' ')

    def _obtain_cx(self, keyword):
        """Return the custom-search engine id embedded in the search page.

        Returns:
            The ``cx`` string, or ``'Connection Error: <status>'`` on a
            non-200 response.
        """
        url = "https://talkmarkets.com/search"
        params = {
            "tab": "General",
            "searchQuery": keyword,
        }
        res = requests.get(url=url, headers=self._HEADERS, params=params)
        if res.status_code != 200:
            print(f"Connection Error: {res.status_code}")
            return f"Connection Error: {res.status_code}"

        tree = etree.HTML(res.text)
        # NOTE(review): fixed script index and character offsets; this breaks
        # as soon as the page's inline script changes.
        cx = tree.xpath('.//script[@type="text/javascript"][1]/text()')[1][40:73]
        return cx

    def _obtain_cse_token(self, cx, ):
        """Return the search token for engine *cx* taken from ``cse.js``.

        Returns:
            The token string, or ``'Connection Error: <status>'`` on a
            non-200 response.
        """
        url = "https://cse.google.com/cse.js"
        params = {
            "cx": cx,
        }
        res = requests.get(url=url, headers=self._HEADERS, params=params)
        if res.status_code != 200:
            print(f"Connection Error: {res.status_code}")
            return f"Connection Error: {res.status_code}"

        text = res.text
        # NOTE(review): fixed character offsets into the script; verify they
        # still bracket the token.
        ces_token = text[5744:5786]
        return ces_token


class TheFly_Streaming(News_Downloader):
    """Collect thefly.com news for one symbol (first page only)."""

    # Filter switches sent with every request; all are turned on.
    _FILTERS = (
        'market_stories', 'hot_stocks_filter', 'rumors_filter',
        'general_news_filter', 'periodicals_filter', 'earnings_filter',
        'technical_analysis_filter', 'options_filter', 'syndicates_filter',
        'onthefly', 'insight_filter', 'market_mover_filter', 'e_inter_filter',
        'mid_wrap_filter', 'sec_wrap_filter', 'analyst_wrap_filter',
        'analyst_recommendations', 'upgrade_filter', 'downgrade_filter',
        'initiate_filter', 'no_change_filter', 'events',
    )
    _COLUMNS = ["title", "stock", "abstract", "date", "time"]

    def __init__(self, args=None):
        """Create the downloader with an empty result frame.

        Args:
            args: Optional settings dict forwarded to ``News_Downloader``.
        """
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="AAPL", end_date=None, rounds=3, delay=0.5):
        """Download the first news page for *keyword*.

        Args:
            keyword: Ticker symbol.
            end_date: Optional value for the ``fecha`` query parameter.
            rounds: Accepted for interface compatibility; currently unused.
            delay: Seconds to sleep after the request.
        """
        self._download_first_page(keyword, delay=delay, end_date=end_date)

        # Paging is not implemented yet.
        print("Only support the first page now!")

    def _download_first_page(self, keyword="AAPL", delay=0.5, end_date=None):
        """Fetch and parse the first news page.

        Returns:
            ``None`` on success, or ``'Connection Error: <status>'`` on a
            non-200 response.
        """
        url = "https://thefly.com/news.php"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        }
        params = {'fecha': end_date}
        params.update((name, 'on') for name in self._FILTERS)
        params['symbol'] = keyword

        # SECURITY NOTE: verify=False turns off TLS certificate checking; it
        # is kept because the original request depended on it.
        res = requests.get(url=url, headers=headers, params=params, verify=False)
        if res.status_code != 200:
            print(f'Connection Error: {res.status_code}')
            return f'Connection Error: {res.status_code}'

        tree = etree.HTML(res.text)
        # The first table is skipped; the remaining ones are read for news rows.
        tables = tree.xpath("/html/body/div[2]/div/div/div[1]/table")[1:]
        rows = []
        for table in tables:
            for tr in table.xpath("./tr"):
                title = tr.xpath("./td[2]/div[1]/a/span//text()")
                # Rows without a headline are not news items.
                if len(title) > 0:
                    rows.append([
                        ' '.join(title),
                        ' '.join(tr.xpath("./td[2]/div[1]/div/span/text()")),
                        ' '.join(tr.xpath("./td[2]/div[2]/dd/p[1]/text()")),
                        ' '.join(tr.xpath("./td[2]/div[1]/span[2]/small/span[3]/text()")),
                        ' '.join(tr.xpath("./td[2]/div[1]/span[2]/small/span[3]/div/text()")),
                    ])

        tmp = pd.DataFrame(rows, columns=self._COLUMNS)
        self.dataframe = pd.concat([self.dataframe, tmp])

        time.sleep(delay)
class TipRanks_Streaming(News_Downloader):
    """Collect TipRanks news posts matching a keyword."""

    def __init__(self, args=None):
        """Create the downloader with an empty result frame.

        Args:
            args: Optional settings dict forwarded to ``News_Downloader``.
        """
        # ``None`` instead of ``{}`` so instances never share one default dict.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_search(self, keyword="apple", rounds=10000, delay=0.5):
        """Download up to *rounds* pages (50 posts each) for *keyword*.

        Stops at the first non-200 response.

        Args:
            keyword: Search term.
            rounds: Maximum number of pages (``page`` = 0 .. rounds-1).
            delay: Seconds to sleep after each page.
        """
        url = "https://www.tipranks.com/api/news/posts"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
        }
        print("Downloading:", end=" ")
        for r in range(rounds):
            params = {
                'page': r,
                'per_page': '50',
                'search': keyword,
            }
            res = requests.get(url=url, headers=headers, params=params)
            if res.status_code != 200:
                break
            try:
                # Keep the response object intact: the error branch prints
                # ``res.text``, which failed once ``res`` had been rebound to
                # the parsed dict.
                payload = json.loads(res.text)
                tmp = pd.DataFrame(payload['data'])
            except (ValueError, KeyError, TypeError):
                print(res.text)
            else:
                self.dataframe = pd.concat([self.dataframe, tmp])

            time.sleep(delay)
            print(r, end=" ")


class Tushare_Major_News(News_Downloader):
    """Collect Tushare "major news" day by day."""

    def __init__(self, args=None):
        """Configure the Tushare client.

        Args:
            args: Optional dict; ``args["token"]`` overrides the built-in
                Tushare API token.
        """
        args = {} if args is None else args
        # SECURITY NOTE: a real API token is hard-coded as the fallback. It is
        # kept only so existing callers keep working; pass your own token.
        token = args.get("token", "27080ec403c0218f96f388bca1b1d85329d563c91a43672239619ef5")
        ts.set_token(token)
        self.pro = ts.pro_api()

    def download_news(self, start_date, end_date, stock="all"):
        """Download the news of every day in the range into ``self.dataframe``.

        Args:
            start_date: First day (anything ``pd.date_range`` accepts).
            end_date: Last day, inclusive.
            stock: Accepted for interface compatibility; currently unused.
        """
        self.date_list = pd.date_range(start_date, end_date)
        frames = [pd.DataFrame()]
        for date in tqdm(self.date_list):
            frames.append(self.gather_one_day_news(date))
        # One concat at the end instead of one per day.
        self.dataframe = pd.concat(frames)

    def gather_one_day_news(self, date, stock="all", delay=0.1):
        """Return the major news of one day.

        Args:
            date: Object with ``strftime`` (e.g. ``pd.Timestamp``).
            stock: Accepted for interface compatibility; currently unused.
            delay: Seconds to sleep after the API call.
        """
        date = self.transfer_standard_date_to_nonstandard(date)
        res = self.pro.major_news(start_date=date, end_date=date)
        time.sleep(delay)
        return res

    def clean_data(self):
        """Placeholder; no cleaning is implemented."""
        pass

    def transfer_standard_date_to_nonstandard(self, date):
        """Format *date* as ``YYYY-MM-DD 00:00:00``.

        The previous format string ``"%Y-%m0%d"`` had ``0`` where the second
        dash belongs and produced values such as ``2023-01005``.
        """
        return date.strftime("%Y-%m-%d 00:00:00")


class Yahoo_Date_Range(News_Downloader):
    """Collect company news through Finnhub and fetch the article texts."""

    # Link on the Yahoo page that leads to the original publisher's article.
    _SOURCE_LINK_XPATH = '/html/body/div[5]/div[2]/section[1]/article[2]/div/div[2]/p/a/@href'

    # Sources whose text sits in one container of the first page:
    # source name -> XPath of that container.
    _CONTAINER_XPATHS = {
        "Yahoo": "/html/body/div[3]/div[1]/div/main/div[1]/div/div/div/div/article/div/div/div/div/div/div[2]/div[4]",
        "Reuters": "/html/body/div[1]/div[3]/div/main/article/div[1]/div[2]/div/div/div[2]",
        "SeekingAlpha": "/html/body/div[2]/div/div[1]/main/div/div[2]/div/article/div/div/div[2]/div/section[1]/div/div/div",
        "PennyStocks": "/html/body/div[3]/div/div[1]/div/div/div/main/article/div[2]/div[2]/div",
    }

    # Sources whose text is only available after following the source link.
    _LINKED_SOURCES = ("Seeking Alpha", "Thefly.com", "GuruFocus", "InvestorPlace", "TipRanks")

    def __init__(self, args=None):
        """Create the Finnhub client.

        Args:
            args: Settings dict; must contain the Finnhub API key as
                ``args["token"]``.

        Raises:
            AssertionError: If no token is supplied.
        """
        args = {} if args is None else args
        super().__init__(args)
        # Raised explicitly (same exception type as the former ``assert``) so
        # the check also runs under ``python -O``.
        if "token" not in args:
            raise AssertionError("Please input your finnhub token. Avaliable at https://finnhub.io/dashboard")
        self.finnhub_client = finnhub.Client(api_key=args["token"])

    def download_date_range_stock(self, start_date, end_date, stock="AAPL"):
        """Download the headlines for *stock* in chunks of four days.

        Args:
            start_date: First day (anything ``pd.date_range`` accepts).
            end_date: Last day, inclusive.
            stock: Ticker symbol.
        """
        days_each_time = 4
        self.date_list = pd.date_range(start_date, end_date)
        chunks = [
            self.date_list[i:i + days_each_time]
            for i in range(0, len(self.date_list), days_each_time)
        ]

        frames = [pd.DataFrame()]
        with tqdm(total=len(chunks), desc="Downloading Titles") as bar:
            for chunk in chunks:
                frames.append(self._gather_one_part(
                    chunk[0].strftime("%Y-%m-%d"),
                    chunk[-1].strftime("%Y-%m-%d"),
                    stock=stock,
                ))
                bar.update(1)

        self.dataframe = pd.concat(frames)
        # No news at all means no 'datetime' column to convert.
        if "datetime" in self.dataframe.columns:
            self.dataframe["datetime"] = pd.to_datetime(self.dataframe["datetime"], unit="s")
        self.dataframe = self.dataframe.reset_index(drop=True)

    def _gather_one_part(self, start_date, end_date, stock="AAPL", delay=1):
        """Return Finnhub's company news for one date chunk as a DataFrame."""
        res = self.finnhub_client.company_news(stock, _from=start_date, to=end_date)
        time.sleep(delay)
        return pd.DataFrame(res)

    def gather_content(self, delay=0.01):
        """Fill ``self.dataframe['content']`` with the article texts."""
        pbar = tqdm(total=self.dataframe.shape[0], desc="Gathering news contents")
        self.dataframe["content"] = self.dataframe.apply(lambda x: self._gather_content_apply(x, pbar, delay), axis=1)

    def _gather_content_apply(self, x, pbar, delay=0.01):
        """Download one article and return its text.

        Args:
            x: Row with ``url`` and ``source`` attributes.
            pbar: Progress bar advanced once per row.
            delay: Seconds to sleep before the request.

        Returns:
            The article text, or one of the markers ``"Connection Error"``,
            ``"Not supported yet"`` and ``"Error"`` (page layout not as
            expected).
        """
        time.sleep(delay)
        url = x.url
        source = x.source
        response = self._request_get(url=url)
        pbar.update(1)
        if response is None:
            return "Connection Error"

        page = etree.HTML(response.text)
        try:
            return self._extract_content(source, page)
        except Exception:
            # Any layout mismatch (missing node, bad JSON, ...) is reported
            # with a marker so one article cannot abort the whole ``apply``.
            return "Error"

    def _extract_content(self, source, page):
        """Dispatch to the source-specific extraction; may raise on mismatch."""
        if source in self._CONTAINER_XPATHS:
            container = page.xpath(self._CONTAINER_XPATHS[source])[0]
            return "\n".join(container.xpath(".//text()"))

        if source == "MarketWatch":
            container = page.xpath('//*[@id="js-article__body"]')[0]
            content = "".join(container.xpath(".//text()"))
            # NOTE(review): SOURCE is whitespace-collapsed, so the exact
            # whitespace literals are not recoverable. The first loop as shown
            # (replace " " by " ") could never end, so it is read as
            # collapsing double spaces; the next two are kept as shown.
            while "  " in content:
                content = content.replace("  ", " ")
            while "\n \n" in content:
                content = content.replace("\n \n", " ")
            while "\n " in content:
                content = content.replace("\n ", " ")
            return content

        if source == "Alliance News":
            container = page.xpath('//*[@id="comtext"]')[0]
            lines = [c for c in container.xpath(".//text()") if not str(c).startswith("\r\n")]
            return "\n".join(lines)

        if source == "CNBC":
            return "\n".join(page.xpath('/html/body/div[3]/div/div[1]/div[3]/div/div/div/div[3]/div[1]/div[2]/div[3]//text()'))

        if source in self._LINKED_SOURCES:
            return self._extract_linked_content(source, page)

        # Includes "TalkMarkets" and every unknown source.
        return "Not supported yet"

    def _extract_linked_content(self, source, page):
        """Follow the source link on *page* and extract the publisher's text."""
        url_new = page.xpath(self._SOURCE_LINK_XPATH)[0]
        # thefly.com is requested without certificate verification, as before.
        request_kwargs = {"verify": False} if source == "Thefly.com" else {}
        response = self._request_get(url=url_new, **request_kwargs)
        if response is None:
            return "Connection Error"
        page = etree.HTML(response.text)

        if source == "Seeking Alpha":
            # NOTE(review): ``page[0]`` is the first child of the document
            # root, not a located article container; kept as in the original.
            return "\n".join(page[0].xpath(".//text()"))

        if source == "Thefly.com":
            texts = page.xpath('/html/body/div[2]/div/div/div/div/div[2]/div[2]//text()')
            return "\n".join(texts).replace("\r\n", "")

        if source == "GuruFocus":
            texts = page.xpath('/html/body/div[1]/div/section/section/main/section/main/div[1]/div/div/div[1]/div[2]/div//text()')
            # NOTE(review): the second literal appears in the collapsed SOURCE
            # as a single space and is reproduced literally; if the original
            # file used a longer run of spaces this must be adjusted.
            return "\n".join(c.replace("\n", "").replace(" ", "") for c in texts)

        if source == "InvestorPlace":
            # The article body is in the second JSON-LD block of the page.
            script = page.xpath('//script[@type="application/ld+json"]')[1]
            return json.loads(script.xpath(".//text()")[0])["articleBody"]

        # source == "TipRanks"
        texts = page.xpath('/html/body/div[1]/div[1]/div[4]/div[2]/div[2]/div[1]/div[6]//text()')
        return "".join(p.replace("\n", "") for p in texts)
Contents + +class Yicai_Streaming(News_Downloader): + + def __init__(self, args={}): + super().__init__(args) + self.dataframe = pd.DataFrame() + + def download_streaming_search(self, keyword = "茅台", rounds = 3, delay = 0.5): + url = "https://www.yicai.com/api/ajax/getSearchResult" + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', + 'Referer':'https://www.yicai.com/search?keys=%E8%8C%85%E5%8F%B0', + 'X-Requested-With': 'XMLHttpRequest', + } + + print("Downloading ...", end = ' ') + for page in range(rounds): + params = { + 'page': page, + 'pagesize': '20', + 'keys': keyword, + 'type': '0', + } + res = requests.get(url = url, headers = headers, params = params) + if res.status_code != 200: + break + res = json.loads(res.text) + res = res['results'] + tmp = pd.DataFrame(res["docs"]) + self.dataframe = pd.concat([self.dataframe, tmp]) + + print(page, end = ' ') + + time.sleep(delay) \ No newline at end of file diff --git a/FinNLP/finnlp/data_sources/sec_filings/README.md b/FinNLP/finnlp/data_sources/sec_filings/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2dd4fee4fb0e4b0f1649a74ab0514aa89d71ee6e --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/README.md @@ -0,0 +1,25 @@ +# SEC DATA DOWNLOADER + +Please checkout this repo that I am building on SEC Question Answering Agent [SEC-QA](https://github.com/Athe-kunal/SEC-QA-Agent) + +This repository downloads all the texts from SEC documents (10-K and 10-Q). Currently, it is not supporting documents that are amended, but that will be added in the near futures. + +Install the required dependencies + +``` +python install -r requirements.txt +``` + +The SEC Downloader expects 5 attributes + +* tickers: It is a list of valid tickers +* amount: Number of documents that you want to download +* filing_type: 10-K or 10-Q filing type +* num_workers: It is for multithreading and multiprocessing. 
We have multi-threading at the ticker level and multi-processing at the year level for a given ticker +* include_amends: To include amendments or not. + + +## REFERENCES +1. Unstructured SEC Filings API: [repo link](https://github.com/Unstructured-IO/pipeline-sec-filings/tree/main) +2. SEC Edgar Downloader: [repo link](https://github.com/jadchaar/sec-edgar-downloader) + diff --git a/FinNLP/finnlp/data_sources/sec_filings/__init__.py b/FinNLP/finnlp/data_sources/sec_filings/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba14c2b0709039f571b151a299eebfbf613ebffe --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/__init__.py @@ -0,0 +1,4 @@ +from finnlp.data_sources.sec_filings.main import SECFilingsLoader + + + diff --git a/FinNLP/finnlp/data_sources/sec_filings/main.py b/FinNLP/finnlp/data_sources/sec_filings/main.py new file mode 100644 index 0000000000000000000000000000000000000000..e71429c49e251066cb4b16648ac104b489ddfcd0 --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/main.py @@ -0,0 +1,99 @@ +from finnlp.data_sources.sec_filings.sec_filings import SECExtractor +import concurrent.futures +import json +import os +import time +from collections import defaultdict +from typing import List + +class SECFilingsLoader(): + """ + SEC Filings loader + Get the SEC filings of multiple tickers + """ + + def __init__( + self, + tickers: List[str], + amount: int, + filing_type: str = "10-K", + num_workers: int = 2, + include_amends: bool = False, + folder_name:str = "data" + ): + assert filing_type in [ + "10-K", + "10-Q", + ], "The supported document types are 10-K and 10-Q" + + self.tickers = tickers + self.amount = amount + self.filing_type = filing_type + self.num_workers = num_workers + self.include_amends = include_amends + + self.se = SECExtractor( + tickers, amount, filing_type, include_amends=include_amends + ) + self.folder_name = folder_name + os.makedirs(self.folder_name, exist_ok=True) + + def 
multiprocess_run(self, tic): + tic_dict = self.se.get_accession_numbers(tic) + text_dict = defaultdict(list) + for tic, fields in tic_dict.items(): + os.makedirs(f"{self.folder_name}/{tic}", exist_ok=True) + print(f"Started for {tic}") + + field_urls = [field["url"] for field in fields] + years = [field["year"] for field in fields] + with concurrent.futures.ProcessPoolExecutor( + max_workers=self.num_workers + ) as executor: + results = executor.map(self.se.get_text_from_url, field_urls) + for idx, res in enumerate(results): + all_text, filing_type = res + text_dict[tic].append( + { + "year": years[idx], + "ticker": tic, + "all_texts": all_text, + "filing_type": filing_type, + } + ) + return text_dict + + def load_data(self): + start = time.time() + thread_workers = min(len(self.tickers), self.num_workers) + with concurrent.futures.ThreadPoolExecutor( + max_workers=thread_workers + ) as executor: + results = executor.map(self.multiprocess_run, self.tickers) + + for res in results: + curr_tic = list(res.keys())[0] + for data in res[curr_tic]: + curr_year = data["year"] + curr_filing_type = data["filing_type"] + if curr_filing_type in ["10-K/A", "10-Q/A"]: + curr_filing_type = curr_filing_type.replace("/", "") + if curr_filing_type in ["10-K", "10-KA"]: + os.makedirs(f"{self.folder_name}/{curr_tic}/{curr_year}", exist_ok=True) + with open( + f"{self.folder_name}/{curr_tic}/{curr_year}/{curr_filing_type}.json", "w" + ) as f: + json.dump(data, f, indent=4) + elif curr_filing_type in ["10-Q", "10-QA"]: + os.makedirs(f"{self.folder_name}/{curr_tic}/{curr_year[:-2]}", exist_ok=True) + with open( + f"{self.folder_name}/{curr_tic}/{curr_year[:-2]}/{curr_filing_type}_{curr_year[-2:]}.json", + "w", + ) as f: + json.dump(data, f, indent=4) + print( + f"Done for {curr_tic} for document {curr_filing_type} and year" + f" {curr_year}" + ) + + print(f"It took {round(time.time()-start,2)} seconds") diff --git 
a/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/api/__init__.py b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/api/app.py b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/api/app.py new file mode 100644 index 0000000000000000000000000000000000000000..587257313c4871f468f2266a68583f4eedfc9484 --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/api/app.py @@ -0,0 +1,47 @@ +##################################################################### +# THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. +# DO NOT MODIFY DIRECTLY +##################################################################### + + +import logging +import os + +from fastapi import FastAPI, Request, status + +from .section import router as section_router + +app = FastAPI( + title="Unstructured Pipeline API", + description="""""", + version="1.0.0", + docs_url="/sec-filings/docs", + openapi_url="/sec-filings/openapi.json", +) + +allowed_origins = os.environ.get("ALLOWED_ORIGINS", None) +if allowed_origins: + from fastapi.middleware.cors import CORSMiddleware + + app.add_middleware( + CORSMiddleware, + allow_origins=allowed_origins.split(","), + allow_methods=["OPTIONS", "POST"], + allow_headers=["Content-Type"], + ) + +app.include_router(section_router) + + +# Filter out /healthcheck noise +class HealthCheckFilter(logging.Filter): + def filter(self, record: logging.LogRecord) -> bool: + return record.getMessage().find("/healthcheck") == -1 + + +logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter()) + + +@app.get("/healthcheck", status_code=status.HTTP_200_OK, include_in_schema=False) +def healthcheck(request: Request): + return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} diff --git 
a/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/api/section.py b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/api/section.py new file mode 100644 index 0000000000000000000000000000000000000000..2801260db1f293a41551f21df6c804ae47ad1c5f --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/api/section.py @@ -0,0 +1,404 @@ +##################################################################### +# THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. +# DO NOT MODIFY DIRECTLY +##################################################################### + +import gzip +import io +import json +import mimetypes +import os +import secrets +from base64 import b64encode +from typing import List, Mapping, Optional, Union + +from fastapi import ( + APIRouter, + FastAPI, + File, + Form, + HTTPException, + Request, + UploadFile, + status, +) +from fastapi.responses import StreamingResponse +from starlette.datastructures import Headers +from starlette.types import Send + +from finnlp.data_sources.sec_filings.prepline_sec_filings.sec_document import ( + REPORT_TYPES, + VALID_FILING_TYPES, + SECDocument, +) +from finnlp.data_sources.sec_filings.prepline_sec_filings.sections import ( + ALL_SECTIONS, + SECTIONS_10K, + SECTIONS_10Q, + SECTIONS_S1, + section_string_to_enum, + validate_section_names, +) + + +import csv +import re +import signal +from enum import Enum +from typing import Dict + +from unstructured.staging.base import convert_to_isd +from unstructured.staging.label_studio import stage_for_label_studio + +app = FastAPI() +router = APIRouter() + + +def is_expected_response_type(media_type, response_type): + if media_type == "application/json" and response_type not in [dict, list]: + return True + elif media_type == "text/csv" and response_type != str: + return True + else: + return False + + +# pipeline-api + + +class timeout: + def __init__(self, seconds=1, error_message="Timeout"): + self.seconds = seconds + 
self.error_message = error_message + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + try: + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + except ValueError: + pass + + def __exit__(self, type, value, traceback): + try: + signal.alarm(0) + except ValueError: + pass + + +def get_regex_enum(section_regex): + class CustomSECSection(Enum): + CUSTOM = re.compile(section_regex) + + @property + def pattern(self): + return self.value + + return CustomSECSection.CUSTOM + + +def convert_to_isd_csv(results: dict) -> str: + """ + Returns the representation of document elements as an Initial Structured Document (ISD) + in CSV Format. + """ + csv_fieldnames: List[str] = ["section", "element_type", "text"] + new_rows = [] + for section, section_narrative in results.items(): + rows: List[Dict[str, str]] = convert_to_isd(section_narrative) + for row in rows: + new_row_item = dict() + new_row_item["section"] = section + new_row_item["element_type"] = row["type"] + new_row_item["text"] = row["text"] + new_rows.append(new_row_item) + + with io.StringIO() as buffer: + csv_writer = csv.DictWriter(buffer, fieldnames=csv_fieldnames) + csv_writer.writeheader() + csv_writer.writerows(new_rows) + return buffer.getvalue() + + +# List of valid response schemas +LABELSTUDIO = "labelstudio" +ISD = "isd" + + +def pipeline_api( + text, + response_type="application/json", + response_schema="isd", + m_section=[], + m_section_regex=[], +): + """Many supported sections including: RISK_FACTORS, MANAGEMENT_DISCUSSION, and many more""" + validate_section_names(m_section) + + sec_document = SECDocument.from_string(text) + if sec_document.filing_type not in VALID_FILING_TYPES: + raise ValueError( + f"SEC document filing type {sec_document.filing_type} is not supported, " + f"must be one of {','.join(VALID_FILING_TYPES)}" + ) + results = {} + if m_section == [ALL_SECTIONS]: + filing_type = 
sec_document.filing_type + if filing_type in REPORT_TYPES: + if filing_type.startswith("10-K"): + m_section = [enum.name for enum in SECTIONS_10K] + elif filing_type.startswith("10-Q"): + m_section = [enum.name for enum in SECTIONS_10Q] + else: + raise ValueError(f"Invalid report type: {filing_type}") + + else: + m_section = [enum.name for enum in SECTIONS_S1] + for section in m_section: + results[section] = sec_document.get_section_narrative( + section_string_to_enum[section] + ) + for i, section_regex in enumerate(m_section_regex): + regex_enum = get_regex_enum(section_regex) + with timeout(seconds=5): + section_elements = sec_document.get_section_narrative(regex_enum) + results[f"REGEX_{i}"] = section_elements + if response_type == "application/json": + if response_schema == LABELSTUDIO: + return { + section: stage_for_label_studio(section_narrative) + for section, section_narrative in results.items() + } + elif response_schema == ISD: + return { + section: convert_to_isd(section_narrative) + for section, section_narrative in results.items() + } + else: + raise ValueError( + f"output_schema '{response_schema}' is not supported for" + f" {response_type}" + ) + elif response_type == "text/csv": + if response_schema != ISD: + raise ValueError( + f"output_schema '{response_schema}' is not supported for" + f" {response_type}" + ) + return convert_to_isd_csv(results) + else: + raise ValueError(f"response_type '{response_type}' is not supported") + + +def get_validated_mimetype(file): + """ + Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too + generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and + return HTTP 400 for an invalid type. 
+ """ + content_type = file.content_type + if not content_type or content_type == "application/octet-stream": + content_type = mimetypes.guess_type(str(file.filename))[0] + + # Some filetypes missing for this library, just hardcode them for now + if not content_type: + if file.filename.endswith(".md"): + content_type = "text/markdown" + elif file.filename.endswith(".msg"): + content_type = "message/rfc822" + + allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES") + if allowed_mimetypes_str is not None: + allowed_mimetypes = allowed_mimetypes_str.split(",") + + if content_type not in allowed_mimetypes: + raise HTTPException( + status_code=400, + detail=( + f"Unable to process {file.filename}: " + f"File type {content_type} is not supported." + ), + ) + + return content_type + + +class MultipartMixedResponse(StreamingResponse): + CRLF = b"\r\n" + + def __init__(self, *args, content_type: str = None, **kwargs): + super().__init__(*args, **kwargs) + self.content_type = content_type + + def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: + super().init_headers(headers) + self.boundary_value = secrets.token_hex(16) + content_type = f'multipart/mixed; boundary="{self.boundary_value}"' + self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) + + @property + def boundary(self): + return b"--" + self.boundary_value.encode() + + def _build_part_headers(self, headers: dict) -> bytes: + header_bytes = b"" + for header, value in headers.items(): + header_bytes += f"{header}: {value}".encode() + self.CRLF + return header_bytes + + def build_part(self, chunk: bytes) -> bytes: + part = self.boundary + self.CRLF + part_headers = { + "Content-Length": len(chunk), + "Content-Transfer-Encoding": "base64", + } + if self.content_type is not None: + part_headers["Content-Type"] = self.content_type + part += self._build_part_headers(part_headers) + part += self.CRLF + chunk + self.CRLF + return part + + async def 
stream_response(self, send: Send) -> None: + await send( + { + "type": "http.response.start", + "status": self.status_code, + "headers": self.raw_headers, + } + ) + async for chunk in self.body_iterator: + if not isinstance(chunk, bytes): + chunk = chunk.encode(self.charset) + chunk = b64encode(chunk) + await send( + { + "type": "http.response.body", + "body": self.build_part(chunk), + "more_body": True, + } + ) + + await send({"type": "http.response.body", "body": b"", "more_body": False}) + + +def ungz_file(file: UploadFile, gz_uncompressed_content_type=None) -> UploadFile: + def return_content_type(filename): + if gz_uncompressed_content_type: + return gz_uncompressed_content_type + else: + return str(mimetypes.guess_type(filename)[0]) + + filename = str(file.filename) if file.filename else "" + if filename.endswith(".gz"): + filename = filename[:-3] + + gzip_file = gzip.open(file.file).read() + return UploadFile( + file=io.BytesIO(gzip_file), + size=len(gzip_file), + filename=filename, + headers=Headers({"content-type": return_content_type(filename)}), + ) + + +@router.post("/sec-filings/v0/section") +@router.post("/sec-filings/v0.2.1/section") +def pipeline_1( + request: Request, + gz_uncompressed_content_type: Optional[str] = Form(default=None), + text_files: Union[List[UploadFile], None] = File(default=None), + output_format: Union[str, None] = Form(default=None), + output_schema: str = Form(default=None), + section: List[str] = Form(default=[]), + section_regex: List[str] = Form(default=[]), +): + if text_files: + for file_index in range(len(text_files)): + if text_files[file_index].content_type == "application/gzip": + text_files[file_index] = ungz_file(text_files[file_index]) + + content_type = request.headers.get("Accept") + + default_response_type = output_format or "application/json" + if not content_type or content_type == "*/*" or content_type == "multipart/mixed": + media_type = default_response_type + else: + media_type = content_type + + 
default_response_schema = output_schema or "isd" + + if isinstance(text_files, list) and len(text_files): + if len(text_files) > 1: + if content_type and content_type not in [ + "*/*", + "multipart/mixed", + "application/json", + ]: + raise HTTPException( + detail=( + f"Conflict in media type {content_type}" + ' with response type "multipart/mixed".\n' + ), + status_code=status.HTTP_406_NOT_ACCEPTABLE, + ) + + def response_generator(is_multipart): + for file in text_files: + get_validated_mimetype(file) + + text = file.file.read().decode("utf-8") + + response = pipeline_api( + text, + m_section=section, + m_section_regex=section_regex, + response_type=media_type, + response_schema=default_response_schema, + ) + + if is_expected_response_type(media_type, type(response)): + raise HTTPException( + detail=( + f"Conflict in media type {media_type}" + f" with response type {type(response)}.\n" + ), + status_code=status.HTTP_406_NOT_ACCEPTABLE, + ) + + valid_response_types = [ + "application/json", + "text/csv", + "*/*", + "multipart/mixed", + ] + if media_type in valid_response_types: + if is_multipart: + if type(response) not in [str, bytes]: + response = json.dumps(response) + yield response + else: + raise HTTPException( + detail=f"Unsupported media type {media_type}.\n", + status_code=status.HTTP_406_NOT_ACCEPTABLE, + ) + + if content_type == "multipart/mixed": + return MultipartMixedResponse( + response_generator(is_multipart=True), content_type=media_type + ) + else: + return ( + list(response_generator(is_multipart=False))[0] + if len(text_files) == 1 + else response_generator(is_multipart=False) + ) + else: + raise HTTPException( + detail='Request parameter "text_files" is required.\n', + status_code=status.HTTP_400_BAD_REQUEST, + ) + + +app.include_router(router) diff --git a/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/fetch.py b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/fetch.py new file mode 100644 index 
0000000000000000000000000000000000000000..83477bc210584ee1ccf3b8deff35eb5c1965e693 --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/fetch.py @@ -0,0 +1,251 @@ +"""Module for fetching data from the SEC EDGAR Archives""" +import json +import os +import re +import sys +from typing import List, Optional, Tuple, Union + +import requests + +if sys.version_info < (3, 8): + from typing_extensions import Final +else: + from typing import Final + +import webbrowser + +try: + from ratelimit import limits, sleep_and_retry +except ImportError: + + def fake_decorator(*args, **kwargs): + def inner(func): + return func + + return inner + + limits = fake_decorator + sleep_and_retry = fake_decorator + +from finnlp.data_sources.sec_filings.prepline_sec_filings.sec_document import ( + VALID_FILING_TYPES, +) + + +SEC_ARCHIVE_URL: Final[str] = "https://www.sec.gov/Archives/edgar/data" +SEC_SEARCH_URL: Final[str] = "http://www.sec.gov/cgi-bin/browse-edgar" +SEC_SUBMISSIONS_URL = "https://data.sec.gov/submissions" + + +def get_filing( + cik: Union[str, int], accession_number: Union[str, int], company: str, email: str +) -> str: + """Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate + limits specified on the SEC website. 
+ ref: https://www.sec.gov/os/accessing-edgar-data""" + session = _get_session(company, email) + return _get_filing(session, cik, accession_number) + + +@sleep_and_retry +@limits(calls=10, period=1) +def _get_filing( + session: requests.Session, cik: Union[str, int], accession_number: Union[str, int] +) -> str: + """Wrapped so filings can be retrieved with an existing session.""" + url = archive_url(cik, accession_number) + response = session.get(url) + response.raise_for_status() + return response.text + + +@sleep_and_retry +@limits(calls=10, period=1) +def get_cik_by_ticker(session: requests.Session, ticker: str) -> str: + """Gets a CIK number from a stock ticker by running a search on the SEC website.""" + cik_re = re.compile(r".*CIK=(\d{10}).*") + url = _search_url(ticker) + response = session.get(url, stream=True) + response.raise_for_status() + results = cik_re.findall(response.text) + return str(results[0]) + + +@sleep_and_retry +@limits(calls=10, period=1) +def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict: + """Gets retrieves dict of recent SEC form filings for a given cik number.""" + json_name = f"CIK{cik}.json" + response = session.get(f"{SEC_SUBMISSIONS_URL}/{json_name}") + response.raise_for_status() + content = json.loads(response.content) + recent_forms = content["filings"]["recent"] + form_types = { + k: v for k, v in zip(recent_forms["accessionNumber"], recent_forms["form"]) + } + return form_types + + +def _get_recent_acc_num_by_cik( + session: requests.Session, cik: Union[str, int], form_types: List[str] +) -> Tuple[str, str]: + """Returns accession number and form type for the most recent filing for one of the + given form_types (AKA filing types) for a given cik.""" + retrieved_form_types = get_forms_by_cik(session, cik) + for acc_num, form_type_ in retrieved_form_types.items(): + if form_type_ in form_types: + return _drop_dashes(acc_num), form_type_ + raise ValueError(f"No filings found for {cik}, looking for any 
of: {form_types}") + + +def get_recent_acc_by_cik( + cik: str, + form_type: str, + company: Optional[str] = None, + email: Optional[str] = None, +) -> Tuple[str, str]: + """Returns (accession_number, retrieved_form_type) for the given cik and form_type. + The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q. + """ + session = _get_session(company, email) + return _get_recent_acc_num_by_cik(session, cik, _form_types(form_type)) + + +def get_recent_cik_and_acc_by_ticker( + ticker: str, + form_type: str, + company: Optional[str] = None, + email: Optional[str] = None, +) -> Tuple[str, str, str]: + """Returns (cik, accession_number, retrieved_form_type) for the given ticker and form_type. + The retrieved_form_type may be an amended version of requested form_type, e.g. 10-Q/A for 10-Q. + """ + session = _get_session(company, email) + cik = get_cik_by_ticker(session, ticker) + acc_num, retrieved_form_type = _get_recent_acc_num_by_cik( + session, cik, _form_types(form_type) + ) + return cik, acc_num, retrieved_form_type + + +def get_form_by_ticker( + ticker: str, + form_type: str, + allow_amended_filing: Optional[bool] = True, + company: Optional[str] = None, + email: Optional[str] = None, +) -> str: + """For a given ticker, gets the most recent form of a given form_type.""" + session = _get_session(company, email) + cik = get_cik_by_ticker(session, ticker) + return get_form_by_cik( + cik, + form_type, + allow_amended_filing=allow_amended_filing, + company=company, + email=email, + ) + + +def _form_types(form_type: str, allow_amended_filing: Optional[bool] = True): + """Potentialy expand to include amended filing, e.g.: + "10-Q" -> "10-Q/A" + """ + assert form_type in VALID_FILING_TYPES + if allow_amended_filing and not form_type.endswith("/A"): + return [form_type, f"{form_type}/A"] + else: + return [form_type] + + +def get_form_by_cik( + cik: str, + form_type: str, + allow_amended_filing: Optional[bool] = True, + company: 
Optional[str] = None, + email: Optional[str] = None, +) -> str: + """For a given CIK, returns the most recent form of a given form_type. By default + an amended version of the form_type may be retrieved (allow_amended_filing=True). + E.g., if form_type is "10-Q", the retrived form could be a 10-Q or 10-Q/A. + """ + session = _get_session(company, email) + acc_num, _ = _get_recent_acc_num_by_cik( + session, cik, _form_types(form_type, allow_amended_filing) + ) + text = _get_filing(session, cik, acc_num) + return text + + +def open_form(cik, acc_num): + """For a given cik and accession number, opens the index page in default browser for the + associated SEC form""" + acc_num = _drop_dashes(acc_num) + webbrowser.open_new_tab( + f"{SEC_ARCHIVE_URL}/{cik}/{acc_num}/{_add_dashes(acc_num)}-index.html" + ) + + +def open_form_by_ticker( + ticker: str, + form_type: str, + allow_amended_filing: Optional[bool] = True, + company: Optional[str] = None, + email: Optional[str] = None, +): + """For a given ticker, opens the index page in default browser for the most recent form of a + given form_type.""" + session = _get_session(company, email) + cik = get_cik_by_ticker(session, ticker) + acc_num, _ = _get_recent_acc_num_by_cik( + session, cik, _form_types(form_type, allow_amended_filing) + ) + open_form(cik, acc_num) + + +def archive_url(cik: Union[str, int], accession_number: Union[str, int]) -> str: + """Builds the archive URL for the SEC accession number. 
Looks for the .txt file for the + filing, while follows a {accession_number}.txt format.""" + filename = f"{_add_dashes(accession_number)}.txt" + accession_number = _drop_dashes(accession_number) + return f"{SEC_ARCHIVE_URL}/{cik}/{accession_number}/{filename}" + + +def _search_url(cik: Union[str, int]) -> str: + search_string = f"CIK={cik}&Find=Search&owner=exclude&action=getcompany" + url = f"{SEC_SEARCH_URL}?{search_string}" + return url + + +def _add_dashes(accession_number: Union[str, int]) -> str: + """Adds the dashes back into the accession number""" + accession_number = str(accession_number) + return f"{accession_number[:10]}-{accession_number[10:12]}-{accession_number[12:]}" + + +def _drop_dashes(accession_number: Union[str, int]) -> str: + """Converts the accession number to the no dash representation.""" + accession_number = str(accession_number).replace("-", "") + return accession_number.zfill(18) + + +def _get_session( + company: Optional[str] = None, email: Optional[str] = None +) -> requests.Session: + """Creates a requests sessions with the appropriate headers set. If these headers are not + set, SEC will reject your request. 
+ ref: https://www.sec.gov/os/accessing-edgar-data""" + if company is None: + company = os.environ.get("SEC_API_ORGANIZATION") + if email is None: + email = os.environ.get("SEC_API_EMAIL") + assert company + assert email + session = requests.Session() + session.headers.update( + { + "User-Agent": f"{company} {email}", + "Content-Type": "text/html", + } + ) + return session diff --git a/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/sec_document.py b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/sec_document.py new file mode 100644 index 0000000000000000000000000000000000000000..16d1f3478b5b0f169ffb71ff5700e8fa623583b2 --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/sec_document.py @@ -0,0 +1,450 @@ +import re +import sys +from functools import partial +from typing import Any, Iterable, Iterator, List, Optional, Tuple + +if sys.version_info < (3, 8): + from typing_extensions import Final +else: + from typing import Final + +from collections import defaultdict + +import numpy as np +import numpy.typing as npt + + +from sklearn.cluster import DBSCAN +from unstructured.cleaners.core import clean +from unstructured.documents.elements import ( + Element, + ListItem, + NarrativeText, + Text, + Title, +) +from unstructured.documents.html import HTMLDocument +from unstructured.nlp.partition import is_possible_title + +# NOTE(yuming): clean_sec_text is a partial cleaner from clean, +# and is used for cleaning a section of text from a SEC filing. 
+clean_sec_text = partial( + clean, extra_whitespace=True, dashes=True, trailing_punctuation=True +) + +from finnlp.data_sources.sec_filings.prepline_sec_filings.sections import SECSection + + + +VALID_FILING_TYPES: Final[List[str]] = [ + "10-K", + "10-Q", + "S-1", + "10-K/A", + "10-Q/A", + "S-1/A", +] +REPORT_TYPES: Final[List[str]] = ["10-K", "10-Q", "10-K/A", "10-Q/A"] +S1_TYPES: Final[List[str]] = ["S-1", "S-1/A"] + +ITEM_TITLE_RE = re.compile(r"(?i)item \d{1,3}(?:[a-z]|\([a-z]\))?(?:\.)?(?::)?") + + +def _raise_for_invalid_filing_type(filing_type: Optional[str]): + if not filing_type: + raise ValueError("Filing type is empty.") + elif filing_type not in VALID_FILING_TYPES: + raise ValueError( + f"Filing type was {filing_type}. Expected: {VALID_FILING_TYPES}" + ) + + +class SECDocument(HTMLDocument): + filing_type = None + + def _filter_table_of_contents(self, elements: List[Text]) -> List[Text]: + """Filter out unnecessary elements in the table of contents using keyword search.""" + if self.filing_type in REPORT_TYPES: + # NOTE(yuming): Narrow TOC as all elements within + # the first two titles that contain the keyword 'part i\b'. + start, end = None, None + for i, element in enumerate(elements): + if bool(re.match(r"(?i)part i\b", clean_sec_text(element.text))): + if start is None: + # NOTE(yuming): Found the start of the TOC section. + start = i + else: + # NOTE(yuming): Found the end of the TOC section. + end = i - 1 + filtered_elements = elements[start:end] + return filtered_elements + elif self.filing_type in S1_TYPES: + # NOTE(yuming): Narrow TOC as all elements within + # the first pair of duplicated titles that contain the keyword 'prospectus'. 
+ title_indices = defaultdict(list) + for i, element in enumerate(elements): + clean_title_text = clean_sec_text(element.text).lower() + title_indices[clean_title_text].append(i) + duplicate_title_indices = { + k: v for k, v in title_indices.items() if len(v) > 1 + } + for title, indices in duplicate_title_indices.items(): + # NOTE(yuming): Make sure that we find the pair of duplicated titles. + if "prospectus" in title and len(indices) == 2: + start = indices[0] + end = indices[1] - 1 + filtered_elements = elements[start:end] + return filtered_elements + # NOTE(yuming): Probably better ways to improve TOC, + # but now we return [] if it fails to find the keyword. + return [] + + def get_table_of_contents(self) -> HTMLDocument: + """Identifies text sections that are likely the table of contents.""" + out_cls = self.__class__ + _raise_for_invalid_filing_type(self.filing_type) + title_locs = to_sklearn_format(self.elements) + if len(title_locs) == 0: + return out_cls.from_elements([]) + # NOTE(alan): Might be a way to do the same thing that doesn't involve the transformations + # necessary to get it into sklearn. We're just looking for densely packed Titles. + res = DBSCAN(eps=6.0).fit_predict(title_locs) + for i in range(res.max() + 1): + idxs = cluster_num_to_indices(i, title_locs, res) + cluster_elements: List[Text] = [self.elements[i] for i in idxs] + if any( + [ + # TODO(alan): Maybe swap risk title out for something more generic? It helps to + # have 2 markers though, I think. 
+ is_risk_title(el.text, self.filing_type) + for el in cluster_elements + if isinstance(el, Title) + ] + ) and any( + [ + is_toc_title(el.text) + for el in cluster_elements + if isinstance(el, Title) + ] + ): + return out_cls.from_elements( + self._filter_table_of_contents(cluster_elements) + ) + return out_cls.from_elements(self._filter_table_of_contents(self.elements)) + + def get_section_narrative_no_toc(self, section: SECSection) -> List[NarrativeText]: + """Identifies narrative text sections that fall under the given section heading without + using the table of contents.""" + _raise_for_invalid_filing_type(self.filing_type) + # NOTE(robinson) - We are not skipping table text because the risk narrative section + # usually does not contain any tables and sometimes tables are used for + # title formating + section_elements: List[NarrativeText] = list() + in_section = False + for element in self.elements: + is_title = is_possible_title(element.text) + if in_section: + if is_title and is_item_title(element.text, self.filing_type): + if section_elements: + return section_elements + else: + in_section = False + elif isinstance(element, NarrativeText) or isinstance( + element, ListItem + ): + section_elements.append(element) + + if is_title and is_section_elem(section, element, self.filing_type): + in_section = True + + return section_elements + + def _get_toc_sections( + self, section: SECSection, toc: HTMLDocument + ) -> Tuple[Text, Text]: + """Identifies section title and next section title in TOC under the given section heading""" + # Note(yuming): The matching section and the section after the matching section + # can be thought of as placeholders to look for matching content below the toc. 
+ section_toc = first( + el for el in toc.elements if is_section_elem(section, el, self.filing_type) + ) + if section_toc is None: + # NOTE(yuming): unable to identify the section in TOC + return (None, None) + + after_section_toc = toc.after_element(section_toc) + next_section_toc = first( + el + for el in after_section_toc.elements + if not is_section_elem(section, el, self.filing_type) + ) + if next_section_toc is None: + # NOTE(yuming): unable to identify the next section title in TOC, + # will leads to failure in finding the end of the section + return (section_toc, None) + return (section_toc, next_section_toc) + + def get_section_narrative(self, section: SECSection) -> List[NarrativeText]: + """Identifies narrative text sections that fall under the given section heading""" + _raise_for_invalid_filing_type(self.filing_type) + # NOTE(robinson) - We are not skipping table text because the risk narrative section + # usually does not contain any tables and sometimes tables are used for + # title formating + toc = self.get_table_of_contents() + if not toc.pages: + return self.get_section_narrative_no_toc(section) + + # Note(yuming): section_toc is the section title in TOC, + # next_section_toc is the section title right after section_toc in TOC + section_toc, next_section_toc = self._get_toc_sections(section, toc) + if section_toc is None: + # NOTE(yuming): fail to find the section title in TOC + return [] + + # NOTE(yuming): we use doc after next_section_toc instead of after toc + # to workaround an issue where the TOC grabbed too many elements by + # starting to parse after the section matched in the TOC + doc_after_section_toc = self.after_element( + next_section_toc if next_section_toc else section_toc + ) + # NOTE(yuming): map section_toc to the section title after TOC + # to find the start of the section + section_start_element = get_element_by_title( + reversed(doc_after_section_toc.elements), section_toc.text, self.filing_type + ) + if 
section_start_element is None: + return [] + doc_after_section_heading = self.after_element(section_start_element) + + # NOTE(yuming): Checks if section_toc is the last section in toc based on + # the structure of the report filings or fails to find the section title in TOC. + # returns everything up to the next Title element + # to avoid the worst case of returning the entire doc. + if self._is_last_section_in_report(section, toc) or next_section_toc is None: + # returns everything after section_start_element in doc + return get_narrative_texts(doc_after_section_heading, up_to_next_title=True) + + # NOTE(yuming): map next_section_toc to the section title after TOC + # to find the start of the next section, which is also the end of the section we want + section_end_element = get_element_by_title( + doc_after_section_heading.elements, next_section_toc.text, self.filing_type + ) + + if section_end_element is None: + # NOTE(yuming): returns everything up to the next Title element + # to avoid the worst case of returning the entire doc. + return get_narrative_texts(doc_after_section_heading, up_to_next_title=True) + + return get_narrative_texts( + doc_after_section_heading.before_element(section_end_element) + ) + + def get_risk_narrative(self) -> List[NarrativeText]: + """Identifies narrative text sections that fall under the "risk" heading""" + return self.get_section_narrative(SECSection.RISK_FACTORS) + + def doc_after_cleaners( + self, skip_headers_and_footers=False, skip_table_text=False, inplace=False + ) -> HTMLDocument: + new_doc = super().doc_after_cleaners( + skip_headers_and_footers, skip_table_text, inplace + ) + if not inplace: + # NOTE(alan): Copy filing_type since this attribute isn't in the base class + new_doc.filing_type = self.filing_type + return new_doc + + def _read_xml(self, content): + super()._read_xml(content) + # NOTE(alan): Get filing type from xml since this is not relevant to the base class. 
+ type_tag = self.document_tree.find(".//type") + if type_tag is not None: + self.filing_type = type_tag.text.strip() + return self.document_tree + + def _is_last_section_in_report( + self, section: SECSection, toc: HTMLDocument + ) -> bool: + """Checks to see if the section is the last section in toc for a report types filing.""" + # Note(yuming): This method assume the section already exists in toc. + if self.filing_type in ["10-K", "10-K/A"]: + # try to get FORM_SUMMARY as last section, else then try to get EXHIBITS. + if section == SECSection.FORM_SUMMARY: + return True + if section == SECSection.EXHIBITS: + form_summary_section = first( + el + for el in toc.elements + if is_section_elem(SECSection.FORM_SUMMARY, el, self.filing_type) + ) + # if FORM_SUMMARY is not in toc, the last section is EXHIBITS + if form_summary_section is None: + return True + if self.filing_type in ["10-Q", "10-Q/A"]: + # try to get EXHIBITS as last section. + if section == SECSection.EXHIBITS: + return True + return False + + +def get_narrative_texts( + doc: HTMLDocument, up_to_next_title: Optional[bool] = False +) -> List[Text]: + """Returns a list of NarrativeText or ListItem from document, + with option to return narrative texts only up to next Title element.""" + if up_to_next_title: + narrative_texts = [] + for el in doc.elements: + if isinstance(el, NarrativeText) or isinstance(el, ListItem): + narrative_texts.append(el) + else: + break + return narrative_texts + else: + return [ + el + for el in doc.elements + if isinstance(el, NarrativeText) or isinstance(el, ListItem) + ] + + +def is_section_elem( + section: SECSection, elem: Text, filing_type: Optional[str] +) -> bool: + """Checks to see if a text element matches the section title for a given filing type""" + _raise_for_invalid_filing_type(filing_type) + if section is SECSection.RISK_FACTORS: + return is_risk_title(elem.text, filing_type=filing_type) + else: + + def _is_matching_section_pattern(text): + return bool( + 
re.search(section.pattern, clean_sec_text(text, lowercase=True)) + ) + + if filing_type in REPORT_TYPES: + return _is_matching_section_pattern( + remove_item_from_section_text(elem.text) + ) + else: + return _is_matching_section_pattern(elem.text) + + +def is_item_title(title: str, filing_type: Optional[str]) -> bool: + """Determines if a title corresponds to an item heading.""" + if filing_type in REPORT_TYPES: + return is_10k_item_title(title) + elif filing_type in S1_TYPES: + return is_s1_section_title(title) + return False + + +def is_risk_title(title: str, filing_type: Optional[str]) -> bool: + """Checks to see if the title matches the pattern for the risk heading.""" + if filing_type in REPORT_TYPES: + return is_10k_risk_title(clean_sec_text(title, lowercase=True)) + elif filing_type in S1_TYPES: + return is_s1_risk_title(clean_sec_text(title, lowercase=True)) + return False + + +def is_toc_title(title: str) -> bool: + """Checks to see if the title matches the pattern for the table of contents.""" + clean_title = clean_sec_text(title, lowercase=True) + return (clean_title == "table of contents") or (clean_title == "index") + + +def is_10k_item_title(title: str) -> bool: + """Determines if a title corresponds to a 10-K item heading.""" + return ITEM_TITLE_RE.match(clean_sec_text(title, lowercase=True)) is not None + + +def is_10k_risk_title(title: str) -> bool: + """Checks to see if the title matches the pattern for the risk heading.""" + return ( + "1a" in title.lower() or "risk factors" in title.lower() + ) and "summary" not in title.lower() + + +def is_s1_section_title(title: str) -> bool: + """Detemines if a title corresponds to a section title.""" + return title.strip().isupper() + + +def is_s1_risk_title(title: str) -> bool: + """Checks to see if the title matches the pattern for the risk heading.""" + return title.strip().lower() == "risk factors" + + +def to_sklearn_format(elements: List[Element]) -> npt.NDArray[np.float32]: + """The input to 
clustering needs to be locations in euclidean space, so we need to interpret + the locations of Titles within the sequence of elements as locations in 1d space + """ + is_title: npt.NDArray[np.bool_] = np.array( + [is_possible_title(el.text) for el in elements][: len(elements)], dtype=bool + ) + title_locs = np.arange(len(is_title)).astype(np.float32)[is_title].reshape(-1, 1) + return title_locs + + +def cluster_num_to_indices( + num: int, elem_idxs: npt.NDArray[np.float32], res: npt.NDArray[np.int_] +) -> List[int]: + """Keeping in mind the input to clustering was indices in a list of elements interpreted as + location in 1-d space, this function gives back the original indices of elements that are + members of the cluster with the given number. + """ + idxs = elem_idxs[res == num].astype(int).flatten().tolist() + return idxs + + +def first(it: Iterable) -> Any: + """Grabs the first item in an iterator.""" + try: + out = next(iter(it)) + except StopIteration: + out = None + return out + + +def match_s1_toc_title_to_section(text: str, title: str) -> bool: + """Matches an S-1 style title from the table of contents to the associated title in the document + body""" + return text == title + + +def match_10k_toc_title_to_section(text: str, title: str) -> bool: + """Matches a 10-K style title from the table of contents to the associated title in the document + body""" + if re.match(ITEM_TITLE_RE, title): + return text.startswith(title) + else: + text = remove_item_from_section_text(text) + return text.startswith(title) + + +def remove_item_from_section_text(text: str) -> str: + """Removes 'item' heading from section text for 10-K/Q forms as preparation for other matching + techniques""" + return re.sub(ITEM_TITLE_RE, "", text).strip() + + +def get_element_by_title( + elements: Iterator[Element], + title: str, + filing_type: Optional[str], +) -> Optional[Element]: + """Get element from Element list whose text approximately matches title""" + 
_raise_for_invalid_filing_type(filing_type) + if filing_type in REPORT_TYPES: + match = match_10k_toc_title_to_section + elif filing_type in S1_TYPES: + match = match_s1_toc_title_to_section + return first( + el + for el in elements + if match( + clean_sec_text(el.text, lowercase=True), + clean_sec_text(title, lowercase=True), + ) + ) diff --git a/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/sections.py b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/sections.py new file mode 100644 index 0000000000000000000000000000000000000000..4d35da23d73b0e757e064e9b2c5a7590d768f7ff --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/prepline_sec_filings/sections.py @@ -0,0 +1,154 @@ +"""Module for defining/enumerating the common sections from SEC forms""" +import re +from enum import Enum +from typing import List + + +class SECSection(Enum): + PROSPECTUS_SUMMARY = re.compile(r"^(?:prospectus )?summary$") + ABOUT_PROSPECTUS = re.compile(r"about this prospectus") + FORWARD_LOOKING_STATEMENTS = re.compile(r"forward[ -]looking statements") + RISK_FACTORS = re.compile(r"risk factors") + USE_OF_PROCEEDS = re.compile(r"use of proceeds") + DIVIDEND_POLICY = re.compile(r"^dividend policy") + CAPITALIZATION = re.compile(r"^capitalization$") + DILUTION = re.compile(r"^dilution$") + MANAGEMENT_DISCUSSION = re.compile(r"^management(?:[\u2019']s)? 
discussion") + BUSINESS = re.compile(r"^business$") + MANAGEMENT = re.compile(r"^(?:(?:our )?management)|(?:executive officers)$") + COMPENSATION = re.compile(r"compensation") + RELATED_PARTY_TRANSACTIONS = re.compile(r"(?:relationships|related).*transactions") + PRINCIPAL_STOCKHOLDERS = re.compile( + r"(?:principal.*(?:stockholder|shareholder)s?)|(?:(security|stock|share) " + r"ownership .*certain)" + ) + DESCRIPTION_OF_STOCK = re.compile( + r"^description of (?:capital stock|share capital|securities)" + ) + DESCRIPTION_OF_DEBT = re.compile(r"^description of .*debt") + FUTURE_SALE = re.compile(r"(?:shares|stock) eligible for future sale") + US_TAX = re.compile( + r"(?:us|u\.s\.|united states|material federal).* tax" + r" (?:consideration|consequence)" + ) + UNDERWRITING = re.compile(r"underwrit") + LEGAL_MATTERS = re.compile(r"legal matters") + EXPERTS = re.compile(r"^experts$") + MORE_INFORMATION = re.compile(r"(?:additional|more) information") + FINANCIAL_STATEMENTS = r"financial statements" + MARKET_RISK_DISCLOSURES = ( + r"(?:quantitative|qualitative) disclosures? about market risk" + ) + CONTROLS_AND_PROCEDURES = r"controls and procedures" + LEGAL_PROCEEDINGS = r"legal proceedings" + DEFAULTS = r"defaults (?:up)?on .*securities" + MINE_SAFETY = r"mine safety disclosures?" + OTHER_INFORMATION = r"other information" + UNRESOLVED_STAFF_COMMENTS = r"unresolved staff comments" + PROPERTIES = r"^properties$" + MARKET_FOR_REGISTRANT_COMMON_EQUITY = ( + r"market for(?: the)? (?:registrant|company)(?:['\u2019]s)? 
common equity" + ) + ACCOUNTING_DISAGREEMENTS = r"disagreements with accountants" + FOREIGN_JURISDICTIONS = r"diclosure .*foreign jurisdictions .*inspection" + EXECUTIVE_OFFICERS = r"executive officers" + ACCOUNTING_FEES = r"accounting fees" + EXHIBITS = r"^exhibits?(.*financial statement schedules)?$" + FORM_SUMMARY = r"^form .*summary$" + # NOTE(yuming): Additional section titles used in test_real_examples.py, + # maybe change this when custom regex string param is allowed. + CERTAIN_TRADEMARKS = r"certain trademarks" + OFFER_PRICE = r"(?:determination of )offering price" + + @property + def pattern(self): + return self.value + + +ALL_SECTIONS = "_ALL" + +section_string_to_enum = {enum.name: enum for enum in SECSection} + +# NOTE(robinson) - Sections are listed in the following document from SEC +# ref: https://www.sec.gov/files/form10-k.pdf +SECTIONS_10K = ( + SECSection.BUSINESS, # ITEM 1 + SECSection.RISK_FACTORS, # ITEM 1A + SECSection.UNRESOLVED_STAFF_COMMENTS, # ITEM 1B + SECSection.PROPERTIES, # ITEM 2 + SECSection.LEGAL_PROCEEDINGS, # ITEM 3 + SECSection.MINE_SAFETY, # ITEM 4 + SECSection.MARKET_FOR_REGISTRANT_COMMON_EQUITY, # ITEM 5 + # NOTE(robinson) - ITEM 6 is "RESERVED" + SECSection.MANAGEMENT_DISCUSSION, # ITEM 7 + SECSection.MARKET_RISK_DISCLOSURES, # ITEM 7A + SECSection.FINANCIAL_STATEMENTS, # ITEM 8 + SECSection.ACCOUNTING_DISAGREEMENTS, # ITEM 9 + SECSection.CONTROLS_AND_PROCEDURES, # ITEM 9A + # NOTE(robinson) - ITEM 9B is other information + SECSection.FOREIGN_JURISDICTIONS, # ITEM 9C + SECSection.MANAGEMENT, # ITEM 10 + SECSection.COMPENSATION, # ITEM 11 + SECSection.PRINCIPAL_STOCKHOLDERS, # ITEM 12 + SECSection.RELATED_PARTY_TRANSACTIONS, # ITEM 13 + SECSection.ACCOUNTING_FEES, # ITEM 14 + SECSection.EXHIBITS, # ITEM 15 + SECSection.FORM_SUMMARY, # ITEM 16 +) + +# NOTE(robinson) - Sections are listed in the following document from SEC +# ref: https://www.sec.gov/files/form10-q.pdf +SECTIONS_10Q = ( + # Part I - Financial information + 
SECSection.FINANCIAL_STATEMENTS, # ITEM 1 + SECSection.MANAGEMENT_DISCUSSION, # ITEM 2 + SECSection.MARKET_RISK_DISCLOSURES, # ITEM 3 + SECSection.CONTROLS_AND_PROCEDURES, # ITEM 4 + # Part II - Other information + SECSection.LEGAL_PROCEEDINGS, # ITEM 1 + SECSection.RISK_FACTORS, # ITEM 1A + SECSection.USE_OF_PROCEEDS, # ITEM 2 + SECSection.DEFAULTS, # ITEM 3 + SECSection.MINE_SAFETY, # ITEM 4 + SECSection.OTHER_INFORMATION, # ITEM 5 +) + +SECTIONS_S1 = ( + SECSection.PROSPECTUS_SUMMARY, + SECSection.ABOUT_PROSPECTUS, + SECSection.FORWARD_LOOKING_STATEMENTS, + SECSection.RISK_FACTORS, + SECSection.USE_OF_PROCEEDS, + SECSection.DIVIDEND_POLICY, + SECSection.CAPITALIZATION, + SECSection.DILUTION, + SECSection.MANAGEMENT_DISCUSSION, + SECSection.BUSINESS, + SECSection.MANAGEMENT, + SECSection.COMPENSATION, + SECSection.RELATED_PARTY_TRANSACTIONS, + SECSection.PRINCIPAL_STOCKHOLDERS, + SECSection.DESCRIPTION_OF_STOCK, + SECSection.DESCRIPTION_OF_DEBT, + SECSection.FUTURE_SALE, + SECSection.US_TAX, + SECSection.UNDERWRITING, + SECSection.LEGAL_MATTERS, + SECSection.EXPERTS, + SECSection.MORE_INFORMATION, +) + + +def validate_section_names(section_names: List[str]): + """Return section names that don't correspond to a defined enum.""" + if len(section_names) == 1 and section_names[0] == ALL_SECTIONS: + return None + elif len(section_names) > 1 and ALL_SECTIONS in section_names: + raise ValueError(f"{ALL_SECTIONS} may not be specified with other sections") + + invalid_names = [ + name for name in section_names if name not in section_string_to_enum + ] + if invalid_names: + raise ValueError(f"The following section names are not valid: {invalid_names}") + return None diff --git a/FinNLP/finnlp/data_sources/sec_filings/sec_filings.py b/FinNLP/finnlp/data_sources/sec_filings/sec_filings.py new file mode 100644 index 0000000000000000000000000000000000000000..6f3a4ed0804a31045b64b9835409438f4ffd6e54 --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/sec_filings.py @@ 
-0,0 +1,296 @@ +from typing import Any, Dict, List + +from finnlp.data_sources.sec_filings.prepline_sec_filings.sec_document import ( + REPORT_TYPES, + VALID_FILING_TYPES, + SECDocument, +) +from finnlp.data_sources.sec_filings.prepline_sec_filings.sections import ( + ALL_SECTIONS, + SECTIONS_10K, + SECTIONS_10Q, + SECTIONS_S1, + section_string_to_enum, + validate_section_names, +) +from finnlp.data_sources.sec_filings.utils import get_filing_urls_to_download + +import re +import signal +from datetime import date +from enum import Enum +from typing import Optional +import requests +from ratelimit import limits, sleep_and_retry +import os + +try: + from unstructured.staging.base import convert_to_isd +except Exception: + + class Element: + pass + + def convert_to_isd(elements: List[Element]) -> List[Dict[str, Any]]: + """Represents the document elements as an Initial Structured Document (ISD).""" + isd: List[Dict[str, str]] = [] + for element in elements: + section = element.to_dict() + isd.append(section) + return isd + + +DATE_FORMAT_TOKENS = "%Y-%m-%d" +DEFAULT_BEFORE_DATE = date.today().strftime(DATE_FORMAT_TOKENS) +DEFAULT_AFTER_DATE = date(2000, 1, 1).strftime(DATE_FORMAT_TOKENS) + + +class timeout: + def __init__(self, seconds=1, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + try: + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + except ValueError: + pass + + def __exit__(self, type, value, traceback): + try: + signal.alarm(0) + except ValueError: + pass + + +# pipeline-api +def get_regex_enum(section_regex): + """Get sections using regular expression + + Args: + section_regex (str): regular expression for the section name + + Returns: + CustomSECSection.CUSTOM: Custom regex section name + """ + + class CustomSECSection(Enum): + CUSTOM = re.compile(section_regex) + + 
@property + def pattern(self): + return self.value + + return CustomSECSection.CUSTOM + + +class SECExtractor: + def __init__( + self, + tickers: List[str], + amount: int, + filing_type: str, + start_date: str = DEFAULT_AFTER_DATE, + end_date: str = DEFAULT_BEFORE_DATE, + sections: List[str] = ["_ALL"], + include_amends: bool = True, + ): + """_summary_ + + Args: + tickers (List[str]): list of ticker + amount (int): amount of documenteds + filing_type (str): 10-K or 10-Q + start_date (str, optional): start date of getting files. Defaults to DEFAULT_AFTER_DATE. + end_date (str, optional): end date of getting files. Defaults to DEFAULT_BEFORE_DATE. + sections (List[str], optional): sections required, check sections names. Defaults to ["_ALL"]. + """ + self.tickers = tickers + self.amount = amount + self.filing_type = filing_type + self.start_date = start_date + self.end_date = end_date + self.sections = sections + self.include_amends = include_amends + + def get_accession_numbers(self, tic: str) -> dict: + """Get accession numbers and download URL for the SEC filing + + Args: + tic (str): ticker symbol + + Returns: + dict: final dictionary for all the urls and years mentioned + """ + final_dict = {} + filing_metadata = get_filing_urls_to_download( + self.filing_type, + tic, + self.amount, + self.start_date, + self.end_date, + include_amends=self.include_amends, + ) + # fm.append(filing_metadata) + acc_nums_yrs = [ + [ + self.get_year(fm.filing_details_url), + fm.accession_number.replace("-", ""), + fm.full_submission_url, + ] + for fm in filing_metadata + ] + for idx, fm in enumerate(acc_nums_yrs[:-1]): + if fm[0] is None: + fm[0] = acc_nums_yrs[idx + 1][0] + for acy in acc_nums_yrs: + if tic not in final_dict: + final_dict.update({tic: []}) + final_dict[tic].append( + {"year": acy[0], "accession_number": acy[1], "url": acy[2]} + ) + return final_dict + + def get_year(self, filing_details: str) -> str: + """Get the year for 10-K and year,month for 10-Q + + Args: + 
filing_details (str): filing url + + Returns: + str: year for 10-K and year,month for 10-Q + """ + details = filing_details.split("/")[-1] + if self.filing_type == "10-K": + matches = re.findall("20\d{2}", details) + elif self.filing_type == "10-Q": + matches = re.findall("20\d{4}", details) + + if matches: + return matches[-1] # Return the first match + else: + return None # In case no match is found + + def get_all_text(self, section, all_narratives): + """Join all the text from a section + + Args: + section (str): section name + all_narratives (dict): dictionary of section names and text + + Returns: + _type_: _description_ + """ + all_texts = [] + for text_dict in all_narratives[section]: + for key, val in text_dict.items(): + if key == "text": + all_texts.append(val) + return " ".join(all_texts) + + def get_text_from_url(self, url: str): + """Get the text from filing document URL + + Args: + url (str): url link + + Returns: + _type_: all texts of sections and filing type of the document + """ + text = self.get_filing( + url, company="Unstructured Technologies", email="support@unstructured.io" + ) + all_narratives, filing_type = self.pipeline_api(text, m_section=self.sections) + all_narrative_dict = dict.fromkeys(all_narratives.keys()) + + for section in all_narratives: + all_narrative_dict[section] = self.get_all_text(section, all_narratives) + + return all_narrative_dict, filing_type + + def pipeline_api(self, text, m_section=[], m_section_regex=[]): + """Unsturcured API to get the text + + Args: + text (str): Text from the filing document URL + m_section (list, optional): Section required. Defaults to []. + m_section_regex (list, optional): Custom Section required using regex . Defaults to []. 
+ + Raises: + ValueError: Invalid document names + ValueError: Invalid section names + + Returns: + section and correspoding texts + """ + validate_section_names(m_section) + + sec_document = SECDocument.from_string(text) + if sec_document.filing_type not in VALID_FILING_TYPES: + raise ValueError( + f"SEC document filing type {sec_document.filing_type} is not supported," + f" must be one of {','.join(VALID_FILING_TYPES)}" + ) + results = {} + if m_section == [ALL_SECTIONS]: + filing_type = sec_document.filing_type + if filing_type in REPORT_TYPES: + if filing_type.startswith("10-K"): + m_section = [enum.name for enum in SECTIONS_10K] + elif filing_type.startswith("10-Q"): + m_section = [enum.name for enum in SECTIONS_10Q] + else: + raise ValueError(f"Invalid report type: {filing_type}") + + else: + m_section = [enum.name for enum in SECTIONS_S1] + for section in m_section: + results[section] = sec_document.get_section_narrative( + section_string_to_enum[section] + ) + + for i, section_regex in enumerate(m_section_regex): + regex_num = get_regex_enum(section_regex) + with timeout(seconds=5): + section_elements = sec_document.get_section_narrative(regex_num) + results[f"REGEX_{i}"] = section_elements + return { + section: convert_to_isd(section_narrative) + for section, section_narrative in results.items() + }, sec_document.filing_type + + @sleep_and_retry + @limits(calls=10, period=1) + def get_filing(self, url: str, company: str, email: str) -> str: + """Fetches the specified filing from the SEC EDGAR Archives. Conforms to the rate + limits specified on the SEC website. + ref: https://www.sec.gov/os/accessing-edgar-data""" + session = self._get_session(company, email) + response = session.get(url) + response.raise_for_status() + return response.text + + def _get_session( + self, company: Optional[str] = None, email: Optional[str] = None + ) -> requests.Session: + """Creates a requests sessions with the appropriate headers set. 
If these headers are not + set, SEC will reject your request. + ref: https://www.sec.gov/os/accessing-edgar-data""" + if company is None: + company = os.environ.get("SEC_API_ORGANIZATION") + if email is None: + email = os.environ.get("SEC_API_EMAIL") + assert company + assert email + session = requests.Session() + session.headers.update( + { + "User-Agent": f"{company} {email}", + "Content-Type": "text/html", + } + ) + return session diff --git a/FinNLP/finnlp/data_sources/sec_filings/utils.py b/FinNLP/finnlp/data_sources/sec_filings/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..14b38b52196a4852601e02cfbcb14bb7c7d32356 --- /dev/null +++ b/FinNLP/finnlp/data_sources/sec_filings/utils.py @@ -0,0 +1,199 @@ +import time +from collections import namedtuple +from pathlib import Path +from typing import List + +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +from faker import Faker + +fake = Faker() + +MAX_RETRIES = 10 +SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1 +FILING_DETAILS_FILENAME_STEM = "filing-details" +SEC_EDGAR_SEARCH_API_ENDPOINT = "https://efts.sec.gov/LATEST/search-index" +SEC_EDGAR_ARCHIVES_BASE_URL = "https://www.sec.gov/Archives/edgar/data" + +retries = Retry( + total=MAX_RETRIES, + backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL, + status_forcelist=[403, 500, 502, 503, 504], +) + +FilingMetadata = namedtuple( + "FilingMetadata", + [ + "accession_number", + "full_submission_url", + "filing_details_url", + "filing_details_filename", + ], +) + + +class EdgarSearchApiError(Exception): + pass + + +def form_request_payload( + ticker_or_cik: str, + filing_types: List[str], + start_date: str, + end_date: str, + start_index: int, + query: str, +) -> dict: + payload = { + "dateRange": "custom", + "startdt": start_date, + "enddt": end_date, + "entityName": ticker_or_cik, + "forms": filing_types, + "from": start_index, + "q": query, + } + return payload + + +def 
build_filing_metadata_from_hit(hit: dict) -> FilingMetadata: + accession_number, filing_details_filename = hit["_id"].split(":", 1) + # Company CIK should be last in the CIK list. This list may also include + # the CIKs of executives carrying out insider transactions like in form 4. + cik = hit["_source"]["ciks"][-1] + accession_number_no_dashes = accession_number.replace("-", "", 2) + + submission_base_url = ( + f"{SEC_EDGAR_ARCHIVES_BASE_URL}/{cik}/{accession_number_no_dashes}" + ) + + full_submission_url = f"{submission_base_url}/{accession_number}.txt" + + # Get XSL if human readable is wanted + # XSL is required to download the human-readable + # and styled version of XML documents like form 4 + # SEC_EDGAR_ARCHIVES_BASE_URL + /320193/000032019320000066/wf-form4_159839550969947.xml + # SEC_EDGAR_ARCHIVES_BASE_URL + + # /320193/000032019320000066/xslF345X03/wf-form4_159839550969947.xml + + # xsl = hit["_source"]["xsl"] + # if xsl is not None: + # filing_details_url = f"{submission_base_url}/{xsl}/{filing_details_filename}" + # else: + # filing_details_url = f"{submission_base_url}/{filing_details_filename}" + + filing_details_url = f"{submission_base_url}/{filing_details_filename}" + + filing_details_filename_extension = Path(filing_details_filename).suffix.replace( + "htm", "html" + ) + filing_details_filename = ( + f"{FILING_DETAILS_FILENAME_STEM}{filing_details_filename_extension}" + ) + + return FilingMetadata( + accession_number=accession_number, + full_submission_url=full_submission_url, + filing_details_url=filing_details_url, + filing_details_filename=filing_details_filename, + ) + + +def generate_random_user_agent() -> str: + return f"{fake.first_name()} {fake.last_name()} {fake.email()}" + + +def get_filing_urls_to_download( + filing_type: str, + ticker_or_cik: str, + num_filings_to_download: int, + after_date: str, + before_date: str, + include_amends: bool, + query: str = "", +) -> List[FilingMetadata]: + """Get the filings URL to download the data 
+ + Returns: + List[FilingMetadata]: Filing metadata from SEC + """ + filings_to_fetch: List[FilingMetadata] = [] + start_index = 0 + client = requests.Session() + client.mount("http://", HTTPAdapter(max_retries=retries)) + client.mount("https://", HTTPAdapter(max_retries=retries)) + try: + while len(filings_to_fetch) < num_filings_to_download: + payload = form_request_payload( + ticker_or_cik, + [filing_type], + after_date, + before_date, + start_index, + query, + ) + headers = { + "User-Agent": generate_random_user_agent(), + "Accept-Encoding": "gzip, deflate", + "Host": "efts.sec.gov", + } + resp = client.post( + SEC_EDGAR_SEARCH_API_ENDPOINT, json=payload, headers=headers + ) + resp.raise_for_status() + search_query_results = resp.json() + + if "error" in search_query_results: + try: + root_cause = search_query_results["error"]["root_cause"] + if not root_cause: # pragma: no cover + raise ValueError + + error_reason = root_cause[0]["reason"] + raise EdgarSearchApiError( + f"Edgar Search API encountered an error: {error_reason}. " + f"Request payload:\n{payload}" + ) + except (ValueError, KeyError): # pragma: no cover + raise EdgarSearchApiError( + "Edgar Search API encountered an unknown error. " + f"Request payload:\n{payload}" + ) from None + + query_hits = search_query_results["hits"]["hits"] + + # No more results to process + if not query_hits: + break + + for hit in query_hits: + hit_filing_type = hit["_source"]["file_type"] + + is_amend = hit_filing_type[-2:] == "/A" + if not include_amends and is_amend: + continue + if is_amend: + num_filings_to_download += 1 + # Work around bug where incorrect filings are sometimes included. + # For example, AAPL 8-K searches include N-Q entries. 
+ if not is_amend and hit_filing_type != filing_type: + continue + + metadata = build_filing_metadata_from_hit(hit) + filings_to_fetch.append(metadata) + + if len(filings_to_fetch) == num_filings_to_download: + return filings_to_fetch + + # Edgar queries 100 entries at a time, but it is best to set this + # from the response payload in case it changes in the future + query_size = search_query_results["query"]["size"] + start_index += query_size + + # Prevent rate limiting + time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL) + finally: + client.close() + + return filings_to_fetch diff --git a/FinNLP/finnlp/data_sources/social_media/__init__.py b/FinNLP/finnlp/data_sources/social_media/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/FinNLP/finnlp/data_sources/social_media/_base.py b/FinNLP/finnlp/data_sources/social_media/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..584e663eee798f1d42a3ea823d43b270cca23540 --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/_base.py @@ -0,0 +1,19 @@ +from finnlp.data_sources._base import FinNLP_Downloader + +class Social_Media_Downloader(FinNLP_Downloader): + + def __init__(self, args = {}): + super().__init__(args) + pass + + def download(self, start_date, end_date, stock = "all"): + pass + + def clean_data(self): + pass + + def gather_one_day_news(self,date,stock = "all",delay = 0.1): + pass + + def transfer_standard_date_to_nonstandard(self,date): + pass \ No newline at end of file diff --git a/FinNLP/finnlp/data_sources/social_media/eastmoney_streaming.py b/FinNLP/finnlp/data_sources/social_media/eastmoney_streaming.py new file mode 100644 index 0000000000000000000000000000000000000000..9478aa92732c23ab3483d3af266ecb454d3eafb2 --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/eastmoney_streaming.py @@ -0,0 +1,43 @@ +import warnings +warnings.filterwarnings("ignore") +import requests +from lxml 
import etree +from tqdm import tqdm +import pandas as pd +import json +import time +from finnlp.data_sources.social_media._base import Social_Media_Downloader + +# TODO: +# 1. Contents + +class Eastmoney_Streaming(Social_Media_Downloader): + def __init__(self, args = {}): + super().__init__(args) + self.dataframe = pd.DataFrame() + + def download_streaming_stock(self, keyword = "600519", rounds = 3, delay = 0.5): + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', + } + print('Downloading ...', end =' ') + for page in range(rounds): + url = f"https://guba.eastmoney.com/list,{keyword}_{page+1}.html" + res = requests.get(url=url, headers=headers) + if res.status_code != 200: + break + + res = etree.HTML(res.text) + res = res.xpath("//script")[3].xpath("text()")[0] + article_list, other_list = res.split('var article_list=')[1].strip(";").split('; var other_list=') + article_list = json.loads(article_list) + tmp = pd.DataFrame(article_list['re']) + self.dataframe = pd.concat([self.dataframe, tmp]) + + print(page, end =' ') + time.sleep(delay) + + self.dataframe = self.dataframe.reset_index(drop= True) + + + \ No newline at end of file diff --git a/FinNLP/finnlp/data_sources/social_media/facebook_streaming.py b/FinNLP/finnlp/data_sources/social_media/facebook_streaming.py new file mode 100644 index 0000000000000000000000000000000000000000..4f22f9224f97aba460df2ab1b89aca69fb05ede0 --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/facebook_streaming.py @@ -0,0 +1,103 @@ +import warnings +warnings.filterwarnings("ignore") +import requests +from lxml import etree +from tqdm import tqdm +import pandas as pd +import json +import time +from finnlp.data_sources.social_media._base import Social_Media_Downloader + +# TODO: +# 1. 
Better performance + +import json +import time +import numpy as np + +from selenium import webdriver +from selenium.webdriver.common.by import By + +class Facebook_Streaming(Social_Media_Downloader): + def __init__(self, args = {}): + super().__init__(args) + self.dataframe = pd.DataFrame() + self.cookies = args["cookies"] + self.stealth_path = args["stealth_path"] + self.headless = args["headless"] if "headless" in args.keys() else True + + def download_streaming_stock(self, keyword = "AAPL", rounds = 3, delay = 0.5): + # init + self._init_opt() + + # search for the keyword + search_url = "https://m.facebook.com/search_results/?q=" + keyword + self.browser.get(search_url) + + # click on the posts + post_element = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div[3]/div[1]")[0] + post_element.click() + time.sleep(5) + + # click on recent posts + post_element = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div[3]/div[1]")[0] + post_element.click() + time.sleep(5) + + # get data + all = [] + title_divs = self.browser.find_elements(By.XPATH, "/html/body/div[2]/div/div[2]/div") + for title_div in tqdm(title_divs): + # title + try: + title = title_div.find_elements(By.XPATH,"./div[2]/div/div/div[2]/div/div/div/div") + if len(title)>0: + title = title[0].text + else: + title = np.nan + except Exception as e: + print(e) + title = np.nan + + # time + try: + time_element = title_div.find_elements(By.XPATH, './div[2]/div/div/div[1]/div/div/div/div[2]/div[2]/div/span') + if len(time_element)>0: + time_ = time_element[0].text + else: + time_ = np.nan + except: + time_ = np.nan + all.append((title, time_)) + + # close browser + self.browser.close() + + tmp = pd.DataFrame(all, columns=["content", "date"]) + self.dataframe = pd.concat([self.dataframe, tmp]) + self.dataframe = self.dataframe.dropna(how="all") + + print("Only support the first page now!") + + + def _init_opt(self): + self.chromeOptions = webdriver.ChromeOptions() + if 
self.headless: + self.chromeOptions.add_argument('--headless') + self.chromeOptions.add_argument('--disable-blink-features=AutomationControlled') + self.chromeOptions.add_argument("--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1") + + self.chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation']) + self.browser = webdriver.Chrome(options=self.chromeOptions) + with open(self.stealth_path) as f: + js = f.read() + self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { + "source": js + }) + self.browser.get('https://m.facebook.com/') + self.browser.delete_all_cookies() + for i in self.cookies: + self.browser.add_cookie(i) + + self.browser.implicitly_wait(2) + diff --git a/FinNLP/finnlp/data_sources/social_media/finnhub_sentiment.py b/FinNLP/finnlp/data_sources/social_media/finnhub_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..97ad9a1d6a691f89e1b513d0f826716056c8455a --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/finnhub_sentiment.py @@ -0,0 +1,53 @@ +from finnlp.data_sources.social_media._base import Social_Media_Downloader +from tqdm.notebook import tqdm +import pandas as pd +import finnhub +import time + +class Finnhub_Sentiment(Social_Media_Downloader): + def __init__(self, args = {}): + super().__init__(args) + assert "token" in args.keys(), "Please input your finnhub token. 
Avaliable at https://finnhub.io/dashboard" + self.finnhub_client = finnhub.Client(api_key=args["token"]) + self.delay = args["delay"] if "dalay" in args.keys() else 0.7 + + def download_sentiment(self, start_date, end_date, stock = "APPL"): + self.reddit = pd.DataFrame() + self.twitter = pd.DataFrame() + self.date_list = pd.date_range(start_date,end_date) + days_each_time = 4 + date_list = self.date_list + # cal total lenth + if len(date_list)%days_each_time == 0: + total = len(date_list)//days_each_time + else: + total = len(date_list)//days_each_time+1 + with tqdm(total=total) as bar: + while len(date_list): + tmp_date_list = date_list[:days_each_time] + date_list = date_list[days_each_time:] + tmp_start_date = tmp_date_list[0].strftime("%Y-%m-%d") + tmp_end_date = tmp_date_list[-1].strftime("%Y-%m-%d") + reddit, _stock_name, twitter = self.gather_one_day_sentiment(tmp_start_date,tmp_end_date,stock = stock ) + self.reddit = pd.concat([self.reddit,reddit]) + self.twitter = pd.concat([self.twitter,twitter]) + bar.update(1) + self.reddit = self.reddit.sort_values("atTime") + self.twitter = self.twitter.sort_values("atTime") + + def gather_one_day_sentiment(self,start_date, end_date, stock = "APPL"): + res = self.finnhub_client.stock_social_sentiment(stock, _from=start_date, to=end_date) + reddit = res["reddit"] + symbol = res["symbol"] + twitter = res["twitter"] + reddit = pd.DataFrame(reddit) + # print(reddit) + + twitter = pd.DataFrame(twitter) + try: + reddit["atTime"] = pd.to_datetime(reddit["atTime"],errors = "ignore") + twitter["atTime"] = pd.to_datetime(twitter["atTime"],errors = "ignore") + except: + pass + time.sleep(self.delay) + return reddit,symbol,twitter diff --git a/FinNLP/finnlp/data_sources/social_media/reddit_streaming.py b/FinNLP/finnlp/data_sources/social_media/reddit_streaming.py new file mode 100644 index 0000000000000000000000000000000000000000..c1dd1ad1a79e256954b5b38441671ea46b0d5f6d --- /dev/null +++ 
b/FinNLP/finnlp/data_sources/social_media/reddit_streaming.py @@ -0,0 +1,96 @@ +from finnlp.data_sources.social_media._base import Social_Media_Downloader + +from tqdm import tqdm +from lxml import etree +import requests +import pandas as pd +import json +import base64 + +class Reddit_Streaming(Social_Media_Downloader): + + def __init__(self, args = {}): + super().__init__(args) + self.dataframe = pd.DataFrame() + + def download_streaming_all(self, rounds = 3): + # Download the first page by url + base_url = "https://www.reddit.com/r/wallstreetbets/new/" + pbar = tqdm(total= rounds, desc= "Downloading by pages...") + res = self._request_get(base_url) + if res is None: + raise ConnectionError + + # get the info from init page + html = etree.HTML(res.text) + init = html.xpath("//*[@id='data']/text()")[0] + init = json.loads(init[14:][:-1]) + init = init["posts"]["models"] + tmp_df = pd.DataFrame(init).T.reset_index(drop = True) + self.dataframe = tmp_df + init = [i for i in init if len(i)< 12] + last_id = init[-1] + last_id = self._encode_base64(last_id) + + pbar.update(1) + + # fetch other pages + if rounds > 1: + for _ in range(1,rounds): + last_id = self._fatch_other_pages(last_id, pbar) + + def _fatch_other_pages(self, last_page, pbar): + url = 'https://gql.reddit.com/' + headers = { + "referer":"https://www.reddit.com/", + "authorization": "Bearer -twjFZkBAlpR8gZnZqsGHvz-G5c49PA", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36" + } + data = { + "id": "02e3b6d0d0d7", + "variables": { + "name": "wallstreetbets", + "includeIdentity": False, + "adContext": { + "layout": "CARD", + "clientSignalSessionData": { + "adsSeenCount": 4, + "totalPostsSeenCount": 79, + "sessionStartTime": "2023-04-07T15:32:13.933Z", + } + }, + "isFake": False, + "includeAppliedFlair": False, + "includeDevPlatformMetadata": True, + "includeRecents": False, + "includeTrending": False, + "includeSubredditRankings": 
True, + "includeSubredditChannels": False, + "isAdHocMulti": False, + "isAll": False, + "isLoggedOutGatedOptedin": False, + "isLoggedOutQuarantineOptedin": False, + "isPopular": False, + "recentPostIds": [], + "subredditNames": [], + "sort": "NEW", + "pageSize": 25, + "after": last_page + } + } + response = self._request_post(url = url, headers= headers, json = data) + data = json.loads(response.text) + data = data["data"]["subredditInfoByName"]["elements"]["edges"] + for d in data: + if d["node"]["__typename"] == "SubredditPost": + tmp = pd.DataFrame(d).T + self.dataframe = pd.concat([self.dataframe, tmp]) + last_id = tmp.id.values[0] + + last_id = self._encode_base64(last_id) + pbar.update(1) + + return last_id + + def _encode_base64(self,id): + return base64.b64encode(id.encode('utf-8')).decode() \ No newline at end of file diff --git a/FinNLP/finnlp/data_sources/social_media/stealth.min.js b/FinNLP/finnlp/data_sources/social_media/stealth.min.js new file mode 100644 index 0000000000000000000000000000000000000000..f35256283e4c7f2a7ec140ccec41db3c71c291c0 --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/stealth.min.js @@ -0,0 +1,7 @@ +/*! + * Note: Auto-generated, do not update manually. 
+ * Generated by: https://github.com/berstend/puppeteer-extra/tree/master/packages/extract-stealth-evasions + * Generated on: Sat, 06 Aug 2022 22:10:53 GMT + * License: MIT + */ +(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. 
Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, 
handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make 
oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:'utils => {\n if (!window.chrome) {\n // Use the exact property descriptor found in headful Chrome\n // fetch it via `Object.getOwnPropertyDescriptor(window, \'chrome\')`\n Object.defineProperty(window, \'chrome\', {\n writable: true,\n enumerable: true,\n configurable: false, // note!\n value: {} // We\'ll extend that later\n })\n }\n\n // That means we\'re running headful and don\'t need to mock anything\n if (\'app\' in window.chrome) {\n return // Nothing to do here\n }\n\n const makeError = {\n ErrorInInvocation: fn => {\n const err = new TypeError(`Error in invocation of app.${fn}()`)\n return utils.stripErrorWithAnchor(\n err,\n `at ${fn} (eval at `\n )\n }\n }\n\n // There\'s a some static data in that property which doesn\'t seem to change,\n // we should periodically check for updates: `JSON.stringify(window.app, null, 2)`\n const STATIC_DATA = JSON.parse(\n `\n{\n "isInstalled": false,\n "InstallState": {\n "DISABLED": 
"disabled",\n "INSTALLED": "installed",\n "NOT_INSTALLED": "not_installed"\n },\n "RunningState": {\n "CANNOT_RUN": "cannot_run",\n "READY_TO_RUN": "ready_to_run",\n "RUNNING": "running"\n }\n}\n `.trim()\n )\n\n window.chrome.app = {\n ...STATIC_DATA,\n\n get isInstalled() {\n return false\n },\n\n getDetails: function getDetails() {\n if (arguments.length) {\n throw makeError.ErrorInInvocation(`getDetails`)\n }\n return null\n },\n getIsInstalled: function getDetails() {\n if (arguments.length) {\n throw makeError.ErrorInInvocation(`getIsInstalled`)\n }\n return false\n },\n runningState: function getDetails() {\n if (arguments.length) {\n throw makeError.ErrorInInvocation(`runningState`)\n }\n return \'cannot_run\'\n }\n }\n utils.patchToStringNested(window.chrome.app)\n }',_args:[]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the 
Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"utils => {\n if (!window.chrome) {\n // Use the exact property descriptor found in headful Chrome\n // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`\n Object.defineProperty(window, 'chrome', {\n writable: true,\n enumerable: true,\n configurable: false, // note!\n value: {} // We'll extend that later\n })\n }\n\n // That means we're running headful and don't need to mock anything\n if ('csi' in window.chrome) {\n return // Nothing to do here\n }\n\n // Check 
that the Navigation Timing API v1 is available, we need that\n if (!window.performance || !window.performance.timing) {\n return\n }\n\n const { timing } = window.performance\n\n window.chrome.csi = function() {\n return {\n onloadT: timing.domContentLoadedEventEnd,\n startE: timing.navigationStart,\n pageT: Date.now() - timing.navigationStart,\n tran: 15 // Transition type or something\n }\n }\n utils.patchToString(window.chrome.csi)\n }",_args:[]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found 
for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { opts }) => {\n if (!window.chrome) {\n // Use the exact property descriptor found in headful Chrome\n // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`\n Object.defineProperty(window, 'chrome', {\n writable: true,\n enumerable: true,\n configurable: false, // note!\n value: {} // We'll extend that later\n })\n }\n\n // That means we're running headful and don't need to mock anything\n if ('loadTimes' in window.chrome) {\n return // Nothing to do here\n 
}\n\n // Check that the Navigation Timing API v1 + v2 is available, we need that\n if (\n !window.performance ||\n !window.performance.timing ||\n !window.PerformancePaintTiming\n ) {\n return\n }\n\n const { performance } = window\n\n // Some stuff is not available on about:blank as it requires a navigation to occur,\n // let's harden the code to not fail then:\n const ntEntryFallback = {\n nextHopProtocol: 'h2',\n type: 'other'\n }\n\n // The API exposes some funky info regarding the connection\n const protocolInfo = {\n get connectionInfo() {\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ntEntry.nextHopProtocol\n },\n get npnNegotiatedProtocol() {\n // NPN is deprecated in favor of ALPN, but this implementation returns the\n // HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)\n ? ntEntry.nextHopProtocol\n : 'unknown'\n },\n get navigationType() {\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ntEntry.type\n },\n get wasAlternateProtocolAvailable() {\n // The Alternate-Protocol header is deprecated in favor of Alt-Svc\n // (https://www.mnot.net/blog/2016/03/09/alt-svc), so technically this\n // should always return false.\n return false\n },\n get wasFetchedViaSpdy() {\n // SPDY is deprecated in favor of HTTP/2, but this implementation returns\n // true for HTTP/2 or HTTP2+QUIC/39 as well.\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ['h2', 'hq'].includes(ntEntry.nextHopProtocol)\n },\n get wasNpnNegotiated() {\n // NPN is deprecated in favor of ALPN, but this implementation returns true\n // for HTTP/2 or HTTP2+QUIC/39 requests negotiated via ALPN.\n const ntEntry =\n performance.getEntriesByType('navigation')[0] || ntEntryFallback\n return ['h2', 
'hq'].includes(ntEntry.nextHopProtocol)\n }\n }\n\n const { timing } = window.performance\n\n // Truncate number to specific number of decimals, most of the `loadTimes` stuff has 3\n function toFixed(num, fixed) {\n var re = new RegExp('^-?\\\\d+(?:.\\\\d{0,' + (fixed || -1) + '})?')\n return num.toString().match(re)[0]\n }\n\n const timingInfo = {\n get firstPaintAfterLoadTime() {\n // This was never actually implemented and always returns 0.\n return 0\n },\n get requestTime() {\n return timing.navigationStart / 1000\n },\n get startLoadTime() {\n return timing.navigationStart / 1000\n },\n get commitLoadTime() {\n return timing.responseStart / 1000\n },\n get finishDocumentLoadTime() {\n return timing.domContentLoadedEventEnd / 1000\n },\n get finishLoadTime() {\n return timing.loadEventEnd / 1000\n },\n get firstPaintTime() {\n const fpEntry = performance.getEntriesByType('paint')[0] || {\n startTime: timing.loadEventEnd / 1000 // Fallback if no navigation occured (`about:blank`)\n }\n return toFixed(\n (fpEntry.startTime + performance.timeOrigin) / 1000,\n 3\n )\n }\n }\n\n window.chrome.loadTimes = function() {\n return {\n ...protocolInfo,\n ...timingInfo\n }\n }\n utils.patchToString(window.chrome.loadTimes)\n }",_args:[{opts:{}}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they 
throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { opts, STATIC_DATA }) => {\n if (!window.chrome) {\n // Use the exact property descriptor found in headful Chrome\n // fetch it via `Object.getOwnPropertyDescriptor(window, 'chrome')`\n Object.defineProperty(window, 'chrome', {\n writable: true,\n enumerable: true,\n configurable: false, // note!\n value: {} // We'll extend that later\n })\n }\n\n // That means we're running headful and don't need to mock anything\n const existsAlready = 'runtime' in window.chrome\n // 
`chrome.runtime` is only exposed on secure origins\n const isNotSecure = !window.location.protocol.startsWith('https')\n if (existsAlready || (isNotSecure && !opts.runOnInsecureOrigins)) {\n return // Nothing to do here\n }\n\n window.chrome.runtime = {\n // There's a bunch of static data in that property which doesn't seem to change,\n // we should periodically check for updates: `JSON.stringify(window.chrome.runtime, null, 2)`\n ...STATIC_DATA,\n // `chrome.runtime.id` is extension related and returns undefined in Chrome\n get id() {\n return undefined\n },\n // These two require more sophisticated mocks\n connect: null,\n sendMessage: null\n }\n\n const makeCustomRuntimeErrors = (preamble, method, extensionId) => ({\n NoMatchingSignature: new TypeError(\n preamble + `No matching signature.`\n ),\n MustSpecifyExtensionID: new TypeError(\n preamble +\n `${method} called from a webpage must specify an Extension ID (string) for its first argument.`\n ),\n InvalidExtensionID: new TypeError(\n preamble + `Invalid extension id: '${extensionId}'`\n )\n })\n\n // Valid Extension IDs are 32 characters in length and use the letter `a` to `p`:\n // https://source.chromium.org/chromium/chromium/src/+/master:components/crx_file/id_util.cc;drc=14a055ccb17e8c8d5d437fe080faba4c6f07beac;l=90\n const isValidExtensionID = str =>\n str.length === 32 && str.toLowerCase().match(/^[a-p]+$/)\n\n /** Mock `chrome.runtime.sendMessage` */\n const sendMessageHandler = {\n apply: function(target, ctx, args) {\n const [extensionId, options, responseCallback] = args || []\n\n // Define custom errors\n const errorPreamble = `Error in invocation of runtime.sendMessage(optional string extensionId, any message, optional object options, optional function responseCallback): `\n const Errors = makeCustomRuntimeErrors(\n errorPreamble,\n `chrome.runtime.sendMessage()`,\n extensionId\n )\n\n // Check if the call signature looks ok\n const noArguments = args.length === 0\n const tooManyArguments = 
args.length > 4\n const incorrectOptions = options && typeof options !== 'object'\n const incorrectResponseCallback =\n responseCallback && typeof responseCallback !== 'function'\n if (\n noArguments ||\n tooManyArguments ||\n incorrectOptions ||\n incorrectResponseCallback\n ) {\n throw Errors.NoMatchingSignature\n }\n\n // At least 2 arguments are required before we even validate the extension ID\n if (args.length < 2) {\n throw Errors.MustSpecifyExtensionID\n }\n\n // Now let's make sure we got a string as extension ID\n if (typeof extensionId !== 'string') {\n throw Errors.NoMatchingSignature\n }\n\n if (!isValidExtensionID(extensionId)) {\n throw Errors.InvalidExtensionID\n }\n\n return undefined // Normal behavior\n }\n }\n utils.mockWithProxy(\n window.chrome.runtime,\n 'sendMessage',\n function sendMessage() {},\n sendMessageHandler\n )\n\n /**\n * Mock `chrome.runtime.connect`\n *\n * @see https://developer.chrome.com/apps/runtime#method-connect\n */\n const connectHandler = {\n apply: function(target, ctx, args) {\n const [extensionId, connectInfo] = args || []\n\n // Define custom errors\n const errorPreamble = `Error in invocation of runtime.connect(optional string extensionId, optional object connectInfo): `\n const Errors = makeCustomRuntimeErrors(\n errorPreamble,\n `chrome.runtime.connect()`,\n extensionId\n )\n\n // Behavior differs a bit from sendMessage:\n const noArguments = args.length === 0\n const emptyStringArgument = args.length === 1 && extensionId === ''\n if (noArguments || emptyStringArgument) {\n throw Errors.MustSpecifyExtensionID\n }\n\n const tooManyArguments = args.length > 2\n const incorrectConnectInfoType =\n connectInfo && typeof connectInfo !== 'object'\n\n if (tooManyArguments || incorrectConnectInfoType) {\n throw Errors.NoMatchingSignature\n }\n\n const extensionIdIsString = typeof extensionId === 'string'\n if (extensionIdIsString && extensionId === '') {\n throw Errors.MustSpecifyExtensionID\n }\n if (extensionIdIsString 
&& !isValidExtensionID(extensionId)) {\n throw Errors.InvalidExtensionID\n }\n\n // There's another edge-case here: extensionId is optional so we might find a connectInfo object as first param, which we need to validate\n const validateConnectInfo = ci => {\n // More than a first param connectInfo as been provided\n if (args.length > 1) {\n throw Errors.NoMatchingSignature\n }\n // An empty connectInfo has been provided\n if (Object.keys(ci).length === 0) {\n throw Errors.MustSpecifyExtensionID\n }\n // Loop over all connectInfo props an check them\n Object.entries(ci).forEach(([k, v]) => {\n const isExpected = ['name', 'includeTlsChannelId'].includes(k)\n if (!isExpected) {\n throw new TypeError(\n errorPreamble + `Unexpected property: '${k}'.`\n )\n }\n const MismatchError = (propName, expected, found) =>\n TypeError(\n errorPreamble +\n `Error at property '${propName}': Invalid type: expected ${expected}, found ${found}.`\n )\n if (k === 'name' && typeof v !== 'string') {\n throw MismatchError(k, 'string', typeof v)\n }\n if (k === 'includeTlsChannelId' && typeof v !== 'boolean') {\n throw MismatchError(k, 'boolean', typeof v)\n }\n })\n }\n if (typeof extensionId === 'object') {\n validateConnectInfo(extensionId)\n throw Errors.MustSpecifyExtensionID\n }\n\n // Unfortunately even when the connect fails Chrome will return an object with methods we need to mock as well\n return utils.patchToStringNested(makeConnectResponse())\n }\n }\n utils.mockWithProxy(\n window.chrome.runtime,\n 'connect',\n function connect() {},\n connectHandler\n )\n\n function makeConnectResponse() {\n const onSomething = () => ({\n addListener: function addListener() {},\n dispatch: function dispatch() {},\n hasListener: function hasListener() {},\n hasListeners: function hasListeners() {\n return false\n },\n removeListener: function removeListener() {}\n })\n\n const response = {\n name: '',\n sender: undefined,\n disconnect: function disconnect() {},\n onDisconnect: onSomething(),\n 
onMessage: onSomething(),\n postMessage: function postMessage() {\n if (!arguments.length) {\n throw new TypeError(`Insufficient number of arguments.`)\n }\n throw new Error(`Attempting to use a disconnected port object`)\n }\n }\n return response\n }\n }",_args:[{opts:{runOnInsecureOrigins:!1},STATIC_DATA:{OnInstalledReason:{CHROME_UPDATE:"chrome_update",INSTALL:"install",SHARED_MODULE_UPDATE:"shared_module_update",UPDATE:"update"},OnRestartRequiredReason:{APP_UPDATE:"app_update",OS_UPDATE:"os_update",PERIODIC:"periodic"},PlatformArch:{ARM:"arm",ARM64:"arm64",MIPS:"mips",MIPS64:"mips64",X86_32:"x86-32",X86_64:"x86-64"},PlatformNaclArch:{ARM:"arm",MIPS:"mips",MIPS64:"mips64",X86_32:"x86-32",X86_64:"x86-64"},PlatformOs:{ANDROID:"android",CROS:"cros",LINUX:"linux",MAC:"mac",OPENBSD:"openbsd",WIN:"win"},RequestUpdateCheckStatus:{NO_UPDATE:"no_update",THROTTLED:"throttled",UPDATE_AVAILABLE:"update_available"}}}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if 
(!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"utils => {\n /**\n * Input might look funky, we need to normalize it so e.g. 
whitespace isn't an issue for our spoofing.\n *\n * @example\n * video/webm; codecs=\"vp8, vorbis\"\n * video/mp4; codecs=\"avc1.42E01E\"\n * audio/x-m4a;\n * audio/ogg; codecs=\"vorbis\"\n * @param {String} arg\n */\n const parseInput = arg => {\n const [mime, codecStr] = arg.trim().split(';')\n let codecs = []\n if (codecStr && codecStr.includes('codecs=\"')) {\n codecs = codecStr\n .trim()\n .replace(`codecs=\"`, '')\n .replace(`\"`, '')\n .trim()\n .split(',')\n .filter(x => !!x)\n .map(x => x.trim())\n }\n return {\n mime,\n codecStr,\n codecs\n }\n }\n\n const canPlayType = {\n // Intercept certain requests\n apply: function(target, ctx, args) {\n if (!args || !args.length) {\n return target.apply(ctx, args)\n }\n const { mime, codecs } = parseInput(args[0])\n // This specific mp4 codec is missing in Chromium\n if (mime === 'video/mp4') {\n if (codecs.includes('avc1.42E01E')) {\n return 'probably'\n }\n }\n // This mimetype is only supported if no codecs are specified\n if (mime === 'audio/x-m4a' && !codecs.length) {\n return 'maybe'\n }\n\n // This mimetype is only supported if no codecs are specified\n if (mime === 'audio/aac' && !codecs.length) {\n return 'probably'\n }\n // Everything else as usual\n return target.apply(ctx, args)\n }\n }\n\n /* global HTMLMediaElement */\n utils.replaceWithProxy(\n HTMLMediaElement.prototype,\n 'canPlayType',\n canPlayType\n )\n }",_args:[]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n 
}\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. 
[as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. 
value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, 
handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj 
= { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { opts }) => {\n utils.replaceGetterWithProxy(\n Object.getPrototypeOf(navigator),\n 'hardwareConcurrency',\n utils.makeHandler().getterValue(opts.hardwareConcurrency)\n )\n 
}",_args:[{opts:{hardwareConcurrency:4}}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. 
[as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, 
handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make 
oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { opts }) => {\n const languages = opts.languages.length\n ? 
opts.languages\n : ['en-US', 'en']\n utils.replaceGetterWithProxy(\n Object.getPrototypeOf(navigator),\n 'languages',\n utils.makeHandler().getterValue(Object.freeze([...languages]))\n )\n }",_args:[{opts:{languages:[]}}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. 
Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, 
handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make 
oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, opts) => {\n const isSecure = document.location.protocol.startsWith('https')\n\n // In headful on secure origins the permission should be \"default\", not \"denied\"\n if (isSecure) {\n utils.replaceGetterWithProxy(Notification, 'permission', {\n apply() {\n return 'default'\n }\n })\n }\n\n // Another weird behavior:\n // On insecure origins in headful the state is \"denied\",\n // whereas in headless it's \"prompt\"\n if (!isSecure) {\n const handler = {\n apply(target, ctx, args) {\n const param = (args || [])[0]\n\n const isNotifications =\n param && param.name && param.name === 'notifications'\n if (!isNotifications) {\n return utils.cache.Reflect.apply(...arguments)\n }\n\n return Promise.resolve(\n Object.setPrototypeOf(\n {\n state: 'denied',\n onchange: null\n },\n PermissionStatus.prototype\n )\n )\n }\n }\n // Note: Don't use `Object.getPrototypeOf` here\n utils.replaceWithProxy(Permissions.prototype, 'query', 
handler)\n }\n }",_args:[{}]}),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. 
[as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. 
the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, 
handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make 
oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, { fns, data }) => {\n fns = utils.materializeFns(fns)\n\n // That means we're running headful\n const hasPlugins = 'plugins' in navigator && navigator.plugins.length\n if (hasPlugins) {\n return // nothing to do here\n }\n\n const mimeTypes = fns.generateMimeTypeArray(utils, fns)(data.mimeTypes)\n const plugins = fns.generatePluginArray(utils, fns)(data.plugins)\n\n // Plugin and MimeType cross-reference each other, let's do that now\n // Note: We're looping through `data.plugins` here, not the generated `plugins`\n for (const pluginData of data.plugins) {\n pluginData.__mimeTypes.forEach((type, index) => {\n plugins[pluginData.name][index] = mimeTypes[type]\n\n Object.defineProperty(plugins[pluginData.name], type, {\n value: mimeTypes[type],\n writable: false,\n enumerable: false, // Not enumerable\n configurable: true\n })\n Object.defineProperty(mimeTypes[type], 'enabledPlugin', {\n value:\n type === 'application/x-pnacl'\n 
? mimeTypes['application/x-nacl'].enabledPlugin // these reference the same plugin, so we need to re-use the Proxy in order to avoid leaks\n : new Proxy(plugins[pluginData.name], {}), // Prevent circular references\n writable: false,\n enumerable: false, // Important: `JSON.stringify(navigator.plugins)`\n configurable: true\n })\n })\n }\n\n const patchNavigator = (name, value) =>\n utils.replaceProperty(Object.getPrototypeOf(navigator), name, {\n get() {\n return value\n }\n })\n\n patchNavigator('mimeTypes', mimeTypes)\n patchNavigator('plugins', plugins)\n\n // All done\n }",_args:[{fns:{generateMimeTypeArray:"(utils, fns) => mimeTypesData => {\n return fns.generateMagicArray(utils, fns)(\n mimeTypesData,\n MimeTypeArray.prototype,\n MimeType.prototype,\n 'type'\n )\n}",generatePluginArray:"(utils, fns) => pluginsData => {\n return fns.generateMagicArray(utils, fns)(\n pluginsData,\n PluginArray.prototype,\n Plugin.prototype,\n 'name'\n )\n}",generateMagicArray:"(utils, fns) =>\n function(\n dataArray = [],\n proto = MimeTypeArray.prototype,\n itemProto = MimeType.prototype,\n itemMainProp = 'type'\n ) {\n // Quick helper to set props with the same descriptors vanilla is using\n const defineProp = (obj, prop, value) =>\n Object.defineProperty(obj, prop, {\n value,\n writable: false,\n enumerable: false, // Important for mimeTypes & plugins: `JSON.stringify(navigator.mimeTypes)`\n configurable: true\n })\n\n // Loop over our fake data and construct items\n const makeItem = data => {\n const item = {}\n for (const prop of Object.keys(data)) {\n if (prop.startsWith('__')) {\n continue\n }\n defineProp(item, prop, data[prop])\n }\n return patchItem(item, data)\n }\n\n const patchItem = (item, data) => {\n let descriptor = Object.getOwnPropertyDescriptors(item)\n\n // Special case: Plugins have a magic length property which is not enumerable\n // e.g. 
`navigator.plugins[i].length` should always be the length of the assigned mimeTypes\n if (itemProto === Plugin.prototype) {\n descriptor = {\n ...descriptor,\n length: {\n value: data.__mimeTypes.length,\n writable: false,\n enumerable: false,\n configurable: true // Important to be able to use the ownKeys trap in a Proxy to strip `length`\n }\n }\n }\n\n // We need to spoof a specific `MimeType` or `Plugin` object\n const obj = Object.create(itemProto, descriptor)\n\n // Virtually all property keys are not enumerable in vanilla\n const blacklist = [...Object.keys(data), 'length', 'enabledPlugin']\n return new Proxy(obj, {\n ownKeys(target) {\n return Reflect.ownKeys(target).filter(k => !blacklist.includes(k))\n },\n getOwnPropertyDescriptor(target, prop) {\n if (blacklist.includes(prop)) {\n return undefined\n }\n return Reflect.getOwnPropertyDescriptor(target, prop)\n }\n })\n }\n\n const magicArray = []\n\n // Loop through our fake data and use that to create convincing entities\n dataArray.forEach(data => {\n magicArray.push(makeItem(data))\n })\n\n // Add direct property access based on types (e.g. 
`obj['application/pdf']`) afterwards\n magicArray.forEach(entry => {\n defineProp(magicArray, entry[itemMainProp], entry)\n })\n\n // This is the best way to fake the type to make sure this is false: `Array.isArray(navigator.mimeTypes)`\n const magicArrayObj = Object.create(proto, {\n ...Object.getOwnPropertyDescriptors(magicArray),\n\n // There's one ugly quirk we unfortunately need to take care of:\n // The `MimeTypeArray` prototype has an enumerable `length` property,\n // but headful Chrome will still skip it when running `Object.getOwnPropertyNames(navigator.mimeTypes)`.\n // To strip it we need to make it first `configurable` and can then overlay a Proxy with an `ownKeys` trap.\n length: {\n value: magicArray.length,\n writable: false,\n enumerable: false,\n configurable: true // Important to be able to use the ownKeys trap in a Proxy to strip `length`\n }\n })\n\n // Generate our functional function mocks :-)\n const functionMocks = fns.generateFunctionMocks(utils)(\n proto,\n itemMainProp,\n magicArray\n )\n\n // We need to overlay our custom object with a JS Proxy\n const magicArrayObjProxy = new Proxy(magicArrayObj, {\n get(target, key = '') {\n // Redirect function calls to our custom proxied versions mocking the vanilla behavior\n if (key === 'item') {\n return functionMocks.item\n }\n if (key === 'namedItem') {\n return functionMocks.namedItem\n }\n if (proto === PluginArray.prototype && key === 'refresh') {\n return functionMocks.refresh\n }\n // Everything else can pass through as normal\n return utils.cache.Reflect.get(...arguments)\n },\n ownKeys(target) {\n // There are a couple of quirks where the original property demonstrates \"magical\" behavior that makes no sense\n // This can be witnessed when calling `Object.getOwnPropertyNames(navigator.mimeTypes)` and the absense of `length`\n // My guess is that it has to do with the recent change of not allowing data enumeration and this being implemented weirdly\n // For that reason we just completely 
fake the available property names based on our data to match what regular Chrome is doing\n // Specific issues when not patching this: `length` property is available, direct `types` props (e.g. `obj['application/pdf']`) are missing\n const keys = []\n const typeProps = magicArray.map(mt => mt[itemMainProp])\n typeProps.forEach((_, i) => keys.push(`${i}`))\n typeProps.forEach(propName => keys.push(propName))\n return keys\n },\n getOwnPropertyDescriptor(target, prop) {\n if (prop === 'length') {\n return undefined\n }\n return Reflect.getOwnPropertyDescriptor(target, prop)\n }\n })\n\n return magicArrayObjProxy\n }",generateFunctionMocks:"utils => (\n proto,\n itemMainProp,\n dataArray\n) => ({\n /** Returns the MimeType object with the specified index. */\n item: utils.createProxy(proto.item, {\n apply(target, ctx, args) {\n if (!args.length) {\n throw new TypeError(\n `Failed to execute 'item' on '${\n proto[Symbol.toStringTag]\n }': 1 argument required, but only 0 present.`\n )\n }\n // Special behavior alert:\n // - Vanilla tries to cast strings to Numbers (only integers!) and use them as property index lookup\n // - If anything else than an integer (including as string) is provided it will return the first entry\n const isInteger = args[0] && Number.isInteger(Number(args[0])) // Cast potential string to number first, then check for integer\n // Note: Vanilla never returns `undefined`\n return (isInteger ? dataArray[Number(args[0])] : dataArray[0]) || null\n }\n }),\n /** Returns the MimeType object with the specified name. */\n namedItem: utils.createProxy(proto.namedItem, {\n apply(target, ctx, args) {\n if (!args.length) {\n throw new TypeError(\n `Failed to execute 'namedItem' on '${\n proto[Symbol.toStringTag]\n }': 1 argument required, but only 0 present.`\n )\n }\n return dataArray.find(mt => mt[itemMainProp] === args[0]) || null // Not `undefined`!\n }\n }),\n /** Does nothing and shall return nothing */\n refresh: proto.refresh\n ? 
utils.createProxy(proto.refresh, {\n apply(target, ctx, args) {\n return undefined\n }\n })\n : undefined\n})"},data:{mimeTypes:[{type:"application/pdf",suffixes:"pdf",description:"",__pluginName:"Chrome PDF Viewer"},{type:"application/x-google-chrome-pdf",suffixes:"pdf",description:"Portable Document Format",__pluginName:"Chrome PDF Plugin"},{type:"application/x-nacl",suffixes:"",description:"Native Client Executable",__pluginName:"Native Client"},{type:"application/x-pnacl",suffixes:"",description:"Portable Native Client Executable",__pluginName:"Native Client"}],plugins:[{name:"Chrome PDF Plugin",filename:"internal-pdf-viewer",description:"Portable Document Format",__mimeTypes:["application/x-google-chrome-pdf"]},{name:"Chrome PDF Viewer",filename:"mhjfbmdgcfjbbpaeojofohoefgiehjai",description:"",__mimeTypes:["application/pdf"]},{name:"Native Client",filename:"internal-nacl-plugin",description:"",__mimeTypes:["application/x-nacl","application/x-pnacl"]}]}}]}),!1===navigator.webdriver||void 0===navigator.webdriver||delete Object.getPrototypeOf(navigator).webdriver,(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n 
return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, opts) => {\n const getParameterProxyHandler = {\n apply: function(target, ctx, args) {\n const param = (args || [])[0]\n const result = utils.cache.Reflect.apply(target, ctx, args)\n // UNMASKED_VENDOR_WEBGL\n if (param === 37445) {\n return opts.vendor || 'Intel Inc.' 
// default in headless: Google Inc.\n }\n // UNMASKED_RENDERER_WEBGL\n if (param === 37446) {\n return opts.renderer || 'Intel Iris OpenGL Engine' // default in headless: Google SwiftShader\n }\n return result\n }\n }\n\n // There's more than one WebGL rendering context\n // https://developer.mozilla.org/en-US/docs/Web/API/WebGL2RenderingContext#Browser_compatibility\n // To find out the original values here: Object.getOwnPropertyDescriptors(WebGLRenderingContext.prototype.getParameter)\n const addProxy = (obj, propName) => {\n utils.replaceWithProxy(obj, propName, getParameterProxyHandler)\n }\n // For whatever weird reason loops don't play nice with Object.defineProperty, here's the next best thing:\n addProxy(WebGLRenderingContext.prototype, 'getParameter')\n addProxy(WebGL2RenderingContext.prototype, 'getParameter')\n }",_args:[{}]}),(()=>{try{if(window.outerWidth&&window.outerHeight)return;const n=85;window.outerWidth=window.innerWidth,window.outerHeight=window.innerHeight+n}catch(n){}})(),(({_utilsFns:_utilsFns,_mainFunction:_mainFunction,_args:_args})=>{const utils=Object.fromEntries(Object.entries(_utilsFns).map((([key,value])=>[key,eval(value)])));utils.init(),eval(_mainFunction)(utils,..._args)})({_utilsFns:{init:"() => {\n utils.preloadCache()\n}",stripProxyFromErrors:"(handler = {}) => {\n const newHandler = {\n setPrototypeOf: function (target, proto) {\n if (proto === null)\n throw new TypeError('Cannot convert object to primitive value')\n if (Object.getPrototypeOf(target) === Object.getPrototypeOf(proto)) {\n throw new TypeError('Cyclic __proto__ value')\n }\n return Reflect.setPrototypeOf(target, proto)\n }\n }\n // We wrap each trap in the handler in a try/catch and modify the error stack if they throw\n const traps = Object.getOwnPropertyNames(handler)\n traps.forEach(trap => {\n newHandler[trap] = function () {\n try {\n // Forward the call to the defined proxy handler\n return handler[trap].apply(this, arguments || [])\n } catch (err) {\n // 
Stack traces differ per browser, we only support chromium based ones currently\n if (!err || !err.stack || !err.stack.includes(`at `)) {\n throw err\n }\n\n // When something throws within one of our traps the Proxy will show up in error stacks\n // An earlier implementation of this code would simply strip lines with a blacklist,\n // but it makes sense to be more surgical here and only remove lines related to our Proxy.\n // We try to use a known \"anchor\" line for that and strip it with everything above it.\n // If the anchor line cannot be found for some reason we fall back to our blacklist approach.\n\n const stripWithBlacklist = (stack, stripFirstLine = true) => {\n const blacklist = [\n `at Reflect.${trap} `, // e.g. Reflect.get or Reflect.apply\n `at Object.${trap} `, // e.g. Object.get or Object.apply\n `at Object.newHandler. [as ${trap}] ` // caused by this very wrapper :-)\n ]\n return (\n err.stack\n .split('\\n')\n // Always remove the first (file) line in the stack (guaranteed to be our proxy)\n .filter((line, index) => !(index === 1 && stripFirstLine))\n // Check if the line starts with one of our blacklisted strings\n .filter(line => !blacklist.some(bl => line.trim().startsWith(bl)))\n .join('\\n')\n )\n }\n\n const stripWithAnchor = (stack, anchor) => {\n const stackArr = stack.split('\\n')\n anchor = anchor || `at Object.newHandler. [as ${trap}] ` // Known first Proxy line in chromium\n const anchorIndex = stackArr.findIndex(line =>\n line.trim().startsWith(anchor)\n )\n if (anchorIndex === -1) {\n return false // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. 
`TypeError`)\n stackArr.splice(1, anchorIndex)\n return stackArr.join('\\n')\n }\n\n // Special cases due to our nested toString proxies\n err.stack = err.stack.replace(\n 'at Object.toString (',\n 'at Function.toString ('\n )\n if ((err.stack || '').includes('at Function.toString (')) {\n err.stack = stripWithBlacklist(err.stack, false)\n throw err\n }\n\n // Try using the anchor method, fallback to blacklist if necessary\n err.stack = stripWithAnchor(err.stack) || stripWithBlacklist(err.stack)\n\n throw err // Re-throw our now sanitized error\n }\n }\n })\n return newHandler\n}",stripErrorWithAnchor:"(err, anchor) => {\n const stackArr = err.stack.split('\\n')\n const anchorIndex = stackArr.findIndex(line => line.trim().startsWith(anchor))\n if (anchorIndex === -1) {\n return err // 404, anchor not found\n }\n // Strip everything from the top until we reach the anchor line (remove anchor line as well)\n // Note: We're keeping the 1st line (zero index) as it's unrelated (e.g. `TypeError`)\n stackArr.splice(1, anchorIndex)\n err.stack = stackArr.join('\\n')\n return err\n}",replaceProperty:"(obj, propName, descriptorOverrides = {}) => {\n return Object.defineProperty(obj, propName, {\n // Copy over the existing descriptors (writable, enumerable, configurable, etc)\n ...(Object.getOwnPropertyDescriptor(obj, propName) || {}),\n // Add our overrides (e.g. value, get())\n ...descriptorOverrides\n })\n}",preloadCache:"() => {\n if (utils.cache) {\n return\n }\n utils.cache = {\n // Used in our proxies\n Reflect: {\n get: Reflect.get.bind(Reflect),\n apply: Reflect.apply.bind(Reflect)\n },\n // Used in `makeNativeString`\n nativeToStringStr: Function.toString + '' // => `function toString() { [native code] }`\n }\n}",makeNativeString:"(name = '') => {\n return utils.cache.nativeToStringStr.replace('toString', name || '')\n}",patchToString:"(obj, str = '') => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. 
`HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n // `toString` targeted at our proxied Object detected\n if (ctx === obj) {\n // We either return the optional string verbatim or derive the most desired result automatically\n return str || utils.makeNativeString(obj.name)\n }\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",patchToStringNested:"(obj = {}) => {\n return utils.execRecursively(obj, ['function'], utils.patchToString)\n}",redirectToString:"(proxyObj, originalObj) => {\n const handler = {\n apply: function (target, ctx) {\n // This fixes e.g. `HTMLMediaElement.prototype.canPlayType.toString + \"\"`\n if (ctx === Function.prototype.toString) {\n return utils.makeNativeString('toString')\n }\n\n // `toString` targeted at our proxied Object detected\n if (ctx === proxyObj) {\n const fallback = () =>\n originalObj && originalObj.name\n ? 
utils.makeNativeString(originalObj.name)\n : utils.makeNativeString(proxyObj.name)\n\n // Return the toString representation of our original object if possible\n return originalObj + '' || fallback()\n }\n\n if (typeof ctx === 'undefined' || ctx === null) {\n return target.call(ctx)\n }\n\n // Check if the toString protype of the context is the same as the global prototype,\n // if not indicates that we are doing a check across different windows., e.g. the iframeWithdirect` test case\n const hasSameProto = Object.getPrototypeOf(\n Function.prototype.toString\n ).isPrototypeOf(ctx.toString) // eslint-disable-line no-prototype-builtins\n if (!hasSameProto) {\n // Pass the call on to the local Function.prototype.toString instead\n return ctx.toString()\n }\n\n return target.call(ctx)\n }\n }\n\n const toStringProxy = new Proxy(\n Function.prototype.toString,\n utils.stripProxyFromErrors(handler)\n )\n utils.replaceProperty(Function.prototype, 'toString', {\n value: toStringProxy\n })\n}",replaceWithProxy:"(obj, propName, handler) => {\n const originalObj = obj[propName]\n const proxyObj = new Proxy(obj[propName], utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.redirectToString(proxyObj, originalObj)\n\n return true\n}",replaceGetterWithProxy:"(obj, propName, handler) => {\n const fn = Object.getOwnPropertyDescriptor(obj, propName).get\n const fnStr = fn.toString() // special getter function string\n const proxyObj = new Proxy(fn, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { get: proxyObj })\n utils.patchToString(proxyObj, fnStr)\n\n return true\n}",replaceGetterSetter:"(obj, propName, handlerGetterSetter) => {\n const ownPropertyDescriptor = Object.getOwnPropertyDescriptor(obj, propName)\n const handler = { ...ownPropertyDescriptor }\n\n if (handlerGetterSetter.get !== undefined) {\n const nativeFn = ownPropertyDescriptor.get\n handler.get = function() {\n return 
handlerGetterSetter.get.call(this, nativeFn.bind(this))\n }\n utils.redirectToString(handler.get, nativeFn)\n }\n\n if (handlerGetterSetter.set !== undefined) {\n const nativeFn = ownPropertyDescriptor.set\n handler.set = function(newValue) {\n handlerGetterSetter.set.call(this, newValue, nativeFn.bind(this))\n }\n utils.redirectToString(handler.set, nativeFn)\n }\n\n Object.defineProperty(obj, propName, handler)\n}",mockWithProxy:"(obj, propName, pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n\n utils.replaceProperty(obj, propName, { value: proxyObj })\n utils.patchToString(proxyObj)\n\n return true\n}",createProxy:"(pseudoTarget, handler) => {\n const proxyObj = new Proxy(pseudoTarget, utils.stripProxyFromErrors(handler))\n utils.patchToString(proxyObj)\n\n return proxyObj\n}",splitObjPath:"objPath => ({\n // Remove last dot entry (property) ==> `HTMLMediaElement.prototype`\n objName: objPath.split('.').slice(0, -1).join('.'),\n // Extract last dot entry ==> `canPlayType`\n propName: objPath.split('.').slice(-1)[0]\n})",replaceObjPathWithProxy:"(objPath, handler) => {\n const { objName, propName } = utils.splitObjPath(objPath)\n const obj = eval(objName) // eslint-disable-line no-eval\n return utils.replaceWithProxy(obj, propName, handler)\n}",execRecursively:"(obj = {}, typeFilter = [], fn) => {\n function recurse(obj) {\n for (const key in obj) {\n if (obj[key] === undefined) {\n continue\n }\n if (obj[key] && typeof obj[key] === 'object') {\n recurse(obj[key])\n } else {\n if (obj[key] && typeFilter.includes(typeof obj[key])) {\n fn.call(this, obj[key])\n }\n }\n }\n }\n recurse(obj)\n return obj\n}",stringifyFns:"(fnObj = { hello: () => 'world' }) => {\n // Object.fromEntries() ponyfill (in 6 lines) - supported only in Node v12+, modern browsers are fine\n // https://github.com/feross/fromentries\n function fromEntries(iterable) {\n return [...iterable].reduce((obj, [key, val]) => {\n obj[key] = 
val\n return obj\n }, {})\n }\n return (Object.fromEntries || fromEntries)(\n Object.entries(fnObj)\n .filter(([key, value]) => typeof value === 'function')\n .map(([key, value]) => [key, value.toString()]) // eslint-disable-line no-eval\n )\n}",materializeFns:"(fnStrObj = { hello: \"() => 'world'\" }) => {\n return Object.fromEntries(\n Object.entries(fnStrObj).map(([key, value]) => {\n if (value.startsWith('function')) {\n // some trickery is needed to make oldschool functions work :-)\n return [key, eval(`() => ${value}`)()] // eslint-disable-line no-eval\n } else {\n // arrow functions just work\n return [key, eval(value)] // eslint-disable-line no-eval\n }\n })\n )\n}",makeHandler:"() => ({\n // Used by simple `navigator` getter evasions\n getterValue: value => ({\n apply(target, ctx, args) {\n // Let's fetch the value first, to trigger and escalate potential errors\n // Illegal invocations like `navigator.__proto__.vendor` will throw here\n utils.cache.Reflect.apply(...arguments)\n return value\n }\n })\n})",arrayEquals:"(array1, array2) => {\n if (array1.length !== array2.length) {\n return false\n }\n for (let i = 0; i < array1.length; ++i) {\n if (array1[i] !== array2[i]) {\n return false\n }\n }\n return true\n}",memoize:"fn => {\n const cache = []\n return function(...args) {\n if (!cache.some(c => utils.arrayEquals(c.key, args))) {\n cache.push({ key: args, value: fn.apply(this, args) })\n }\n return cache.find(c => utils.arrayEquals(c.key, args)).value\n }\n}"},_mainFunction:"(utils, opts) => {\n try {\n // Adds a contentWindow proxy to the provided iframe element\n const addContentWindowProxy = iframe => {\n const contentWindowProxy = {\n get(target, key) {\n // Now to the interesting part:\n // We actually make this thing behave like a regular iframe window,\n // by intercepting calls to e.g. `.self` and redirect it to the correct thing. 
:)\n // That makes it possible for these assertions to be correct:\n // iframe.contentWindow.self === window.top // must be false\n if (key === 'self') {\n return this\n }\n // iframe.contentWindow.frameElement === iframe // must be true\n if (key === 'frameElement') {\n return iframe\n }\n // Intercept iframe.contentWindow[0] to hide the property 0 added by the proxy.\n if (key === '0') {\n return undefined\n }\n return Reflect.get(target, key)\n }\n }\n\n if (!iframe.contentWindow) {\n const proxy = new Proxy(window, contentWindowProxy)\n Object.defineProperty(iframe, 'contentWindow', {\n get() {\n return proxy\n },\n set(newValue) {\n return newValue // contentWindow is immutable\n },\n enumerable: true,\n configurable: false\n })\n }\n }\n\n // Handles iframe element creation, augments `srcdoc` property so we can intercept further\n const handleIframeCreation = (target, thisArg, args) => {\n const iframe = target.apply(thisArg, args)\n\n // We need to keep the originals around\n const _iframe = iframe\n const _srcdoc = _iframe.srcdoc\n\n // Add hook for the srcdoc property\n // We need to be very surgical here to not break other iframes by accident\n Object.defineProperty(iframe, 'srcdoc', {\n configurable: true, // Important, so we can reset this later\n get: function() {\n return _srcdoc\n },\n set: function(newValue) {\n addContentWindowProxy(this)\n // Reset property, the hook is only needed once\n Object.defineProperty(iframe, 'srcdoc', {\n configurable: false,\n writable: false,\n value: _srcdoc\n })\n _iframe.srcdoc = newValue\n }\n })\n return iframe\n }\n\n // Adds a hook to intercept iframe creation events\n const addIframeCreationSniffer = () => {\n /* global document */\n const createElementHandler = {\n // Make toString() native\n get(target, key) {\n return Reflect.get(target, key)\n },\n apply: function(target, thisArg, args) {\n const isIframe =\n args && args.length && `${args[0]}`.toLowerCase() === 'iframe'\n if (!isIframe) {\n // Everything 
as usual\n return target.apply(thisArg, args)\n } else {\n return handleIframeCreation(target, thisArg, args)\n }\n }\n }\n // All this just due to iframes with srcdoc bug\n utils.replaceWithProxy(\n document,\n 'createElement',\n createElementHandler\n )\n }\n\n // Let's go\n addIframeCreationSniffer()\n } catch (err) {\n // console.warn(err)\n }\n }",_args:[]}); \ No newline at end of file diff --git a/FinNLP/finnlp/data_sources/social_media/stocktwits_streaming.py b/FinNLP/finnlp/data_sources/social_media/stocktwits_streaming.py new file mode 100644 index 0000000000000000000000000000000000000000..c81e7455f2d6451c069c1edaf719526b461a4cfc --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/stocktwits_streaming.py @@ -0,0 +1,50 @@ +from finnlp.data_sources.social_media._base import Social_Media_Downloader + +import requests +import pandas as pd +from tqdm import tqdm +import json + +class Stocktwits_Streaming(Social_Media_Downloader): + + def __init__(self, args = {}): + super().__init__(args) + self.dataframe = pd.DataFrame() + + def download_streaming_stock(self, stock = "AAPL", rounds = 3): + url = f"https://api.stocktwits.com/api/2/streams/symbol/{stock}.json" + headers = { + 'accept': 'application/json', + 'accept-encoding': 'gzip, deflate, br', + 'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7', + 'authorization': 'OAuth 8a881f43cbc7af061ec2aa35deec9b44f7e3cc09', + 'dnt': '1', + 'origin': 'https://stocktwits.com', + 'referer': 'https://stocktwits.com/', + + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36', + } + for i in tqdm(range(rounds)): + if i == 0: + params = { + "filter":"top", + "limit":1000, + # "max":410000000, + } + else: + params = { + "filter":"top", + "limit":1000, + "max":max, + } + response = self._request_get(url = url, headers=headers, params=params) + if response is None: + print(f"Fetch data fail. 
Please check your stock name :{stock} and connections. You may raise an issue if you can't solve this problem") + continue + else: + res = json.loads(response.text) + max = res["cursor"]["since"] + res = pd.DataFrame(res["messages"]) + self.dataframe = pd.concat([self.dataframe,res]) + + self.dataframe = self.dataframe.reset_index(drop = True) diff --git a/FinNLP/finnlp/data_sources/social_media/twitter_date_range.py b/FinNLP/finnlp/data_sources/social_media/twitter_date_range.py new file mode 100644 index 0000000000000000000000000000000000000000..cd9bc4f9009858e51594a210ae8bbc0bd9cab903 --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/twitter_date_range.py @@ -0,0 +1,76 @@ +import warnings +warnings.filterwarnings("ignore") + +from finnlp.data_sources.social_media._base import Social_Media_Downloader + +import requests +from urllib import parse +from tqdm import tqdm +from datetime import datetime,timedelta +import pandas as pd +import json +import time + +class Twitter_Date_Range(Social_Media_Downloader): + + def __init__(self, args = {}): + super().__init__(args) + self.dataframe = pd.DataFrame() + + def download_date_range_stock(self, start_date, end_date, stock = "AAPL"): + self.date_list = pd.date_range(start_date,end_date) + res = pd.DataFrame() + for date in tqdm(self.date_list, desc= "Downloading by day... 
"): + tmp = self._gather_one_day(date,stock) + res = pd.concat([res,tmp]) + + res.created_at = pd.to_datetime(res.created_at) + res = res.sort_values("created_at") + res = res.reset_index(drop=True) + # res = res.query(f"created_at >= @start_date & created_at <= @end_date") + res = res[res.created_at >= start_date][res.created_at <= end_date] + res = res.reset_index(drop=True) + self.dataframe = res + + def _gather_one_day(self, date, stock = "AAPL", pbar = None ,delay = 0.01): + time.sleep(delay) + next_date = date + timedelta(days=1) + date = datetime.strftime(date, "%Y-%m-%d") + next_date = datetime.strftime(next_date, "%Y-%m-%d") + + url = "https://twitter.com/i/api/2/search/adaptive.json?include_profile_interstitial_type=1&include_blocking=1&include_blocked_by=1&include_followed_by=1&include_want_retweets=1&include_mute_edge=1&include_can_dm=1&include_can_media_tag=1&skip_status=1&cards_platform=Web-12&include_cards=1&include_ext_alt_text=true&include_quote_count=true&include_reply_count=1&tweet_mode=extended&include_entities=true&include_user_entities=true&include_ext_media_color=true&include_ext_media_availability=true&send_error_codes=true&simple_quoted_tweet=true&q={}&count=20&query_source=typed_query&pc=1&spelling_corrections=1&ext=mediaStats%2ChighlightedLabel%2CvoiceInfo" + url_token = 'https://api.twitter.com/1.1/guest/activate.json' + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36', + 'Accept': '*/*', + 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', + 'x-guest-token': '', + 'x-twitter-client-language': 'zh-cn', + 'x-twitter-active-user': 'yes', + 'x-csrf-token': '25ea9d09196a6ba850201d47d7e75733', + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-origin', + 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', + 
'Referer': 'https://twitter.com/', + 'Connection': 'keep-alive', + } + + q = f'{stock} until:{next_date} since:{date}' + token = json.loads(requests.post(url_token, headers = headers).text)['guest_token'] + print(token) + headers['x-guest-token'] = token + url = url.format(parse.quote(q)) + print(url) + res = self._request_get(url, headers = headers) + print(res) + if res is not None: + try: + res = json.loads(res.text) + res = pd.DataFrame(res["globalObjects"]["tweets"]).T.sort_values("created_at") + except: + res = pd.DataFrame() + else: + res = pd.DataFrame() + + return res diff --git a/FinNLP/finnlp/data_sources/social_media/weibo_date_range.py b/FinNLP/finnlp/data_sources/social_media/weibo_date_range.py new file mode 100644 index 0000000000000000000000000000000000000000..8726ed86a5e1462861a5a22a0fe18c57d9b97971 --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/weibo_date_range.py @@ -0,0 +1,154 @@ +from finnlp.data_sources.social_media._base import Social_Media_Downloader + +from tqdm import tqdm +from lxml import etree +import pandas as pd +import numpy as np +import requests +import datetime +import time +import json +import re + +class Weibo_Date_Range(Social_Media_Downloader): + def __init__(self, args = {}): + super().__init__(args) + if "cookies" not in args.keys(): + raise ValueError("You need first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ") + self.cookies = args["cookies"] + self.dataframe = pd.DataFrame() + + def download_date_range_stock(self, start_date, end_date, start_hour= 0,end_hour = 0,stock = "茅台", delay = 0.01): + self.date_list = pd.date_range(start_date, end_date) + for date in tqdm(self.date_list, desc = "Downloading by dates..."): + date = date.strftime("%Y-%m-%d") + self._gather_one_day(date, start_hour, end_hour, stock, delay) + self.dataframe = self.dataframe.reset_index(drop = True) + + def _gather_one_day(self,date,start_hour, end_hour, stock = "茅台", delay = 
0.01): + if start_hour == 0 and end_hour == 0: + start_date = datetime.datetime.strptime(date, "%Y-%m-%d") + end_date = start_date + datetime.timedelta(days=1) + start_date = start_date.strftime("%Y-%m-%d") + end_date = end_date.strftime("%Y-%m-%d") + else: + start_date = date, end_date = date + + # first page + all_urls = self._gather_first_page(start_date, end_date, start_hour, end_hour, stock, delay) + # another pages + if len(all_urls)>1: + base_url= "https://s.weibo.com/" + for url_new in all_urls: + url_new = base_url + url_new + self._gather_other_pages(date, url_new, delay) + + def _gather_first_page(self,start_date, end_date, start_hour, end_hour, stock = "茅台", delay = 0.01): + + headers = { + "cookie": self.cookies, + "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0", + } + + params = { + "q": stock, + "typeall": "1", + "suball": "1", + "timescope":f"custom:{start_date}-{start_hour}:{end_date}-{end_hour}", + "Refer":"g", + "page":"1" + } + + url = f"https://s.weibo.com/weibo" + resp = self._request_get(url, headers=headers, params = params) + + if resp is None: + return "Error" + + if "passport.weibo.com" in resp.url: + raise ValueError("Your cookies is useless. 
Please first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ") + + res = etree.HTML(resp.content) + # get all pages + all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href') + items = res.xpath('//div[@class="card-wrap"]') + for i in items: + ps = i.xpath('.//div[@class="content"]//p') + try: + content = ps[0].xpath(".//text()") + content = ''.join(content) + content = content.replace('\n',"") + content = content.replace(' ',"") + content = content.replace('\u200b',"") + except: + continue + + info = ps[1].xpath(".//text()") + try: + date_content = info[1] + date_content = date_content.replace('\n',"") + date_content = date_content.replace(' ',"") + except: + date_content = np.nan + + try: + source = info[3] + except: + source = np.nan + + tmp = pd.DataFrame([start_date, date_content, source, content]).T + tmp.columns = ["date","date_content", "source", "content"] + self.dataframe = pd.concat([self.dataframe, tmp]) + + time.sleep(delay) + + return all_pages + + def _gather_other_pages(self, date, url, delay = 0.01): + + headers = { + "cookie": self.cookies, + "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0", + } + + resp = self._request_get(url, headers=headers) + + if resp is None: + return "Error" + + if "passport.weibo.com" in resp.url: + raise ValueError("Your cookies is useless. 
Please first log in at https://weibo.com/ and then copy you cookies and use it as the [value] of [key] \'cookies\' ") + + res = etree.HTML(resp.content) + # get all pages + all_pages = res.xpath('//*[@id="pl_feedlist_index"]/div[3]/div[1]/span/ul/li//@href') + items = res.xpath('//div[@class="card-wrap"]') + for i in items: + ps = i.xpath('.//div[@class="content"]//p') + try: + content = ps[0].xpath(".//text()") + content = ''.join(content) + content = content.replace('\n',"") + content = content.replace(' ',"") + content = content.replace('\u200b',"") + except: + continue + + info = ps[1].xpath(".//text()") + try: + date_content = info[1] + date_content = date_content.replace('\n',"") + date_content = date_content.replace(' ',"") + except: + date_content = np.nan + + try: + source = info[3] + except: + source = np.nan + + tmp = pd.DataFrame([date, date_content, source, content]).T + tmp.columns = ["date", "date_content", "source", "content"] + self.dataframe = pd.concat([self.dataframe, tmp]) + + time.sleep(delay) diff --git a/FinNLP/finnlp/data_sources/social_media/weibo_streaming.py b/FinNLP/finnlp/data_sources/social_media/weibo_streaming.py new file mode 100644 index 0000000000000000000000000000000000000000..6e07e1193692deec916a9dffdadcd3bd38190652 --- /dev/null +++ b/FinNLP/finnlp/data_sources/social_media/weibo_streaming.py @@ -0,0 +1,78 @@ +from finnlp.data_sources.social_media._base import Social_Media_Downloader + +from tqdm import tqdm +from lxml import etree +import pandas as pd +import requests +import time +import json +import re + +class Weibo_Streaming(Social_Media_Downloader): + def __init__(self, args = {}): + super().__init__(args) + self.dataframe = pd.DataFrame() + + def download_streaming_stock(self, stock = "茅台", rounds = 3): + for r in tqdm(range(rounds), desc="Downloading by page.."): + page = r+1 + self._gather_one_page(page, stock) + + def _gather_one_page(self,page, stock = "茅台", delay = 0.01): + headers = { + "User-Agent":"Mozilla/5.0 
(Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0" + } + params = { + "containerid": f"100103type=61&q={stock}&t=", + "page_type": "searchall", + "page":page + } + url = f"https://m.weibo.cn/api/container/getIndex" + resp = self._request_get(url, headers=headers, params = params) + + if resp is None: + return "Error" + + res = json.loads(resp.text) + res = res["data"]["cards"] + res = pd.DataFrame(res) + + pbar = tqdm(total = res.shape[0], desc = "Processing the text content and downloading the full passage...") + res[["content_short","content"]] = res.apply(lambda x:self._process_text(x, pbar, delay), axis= 1, result_type= "expand") + + self.dataframe = pd.concat([self.dataframe, res]) + + def _process_text(self,x, pbar, delay = 0.01): + text = x["mblog"]["text"] + text = etree.HTML(text) + content_short = text.xpath(".//text()") + content_short = ''.join(content_short) + + link = text.xpath('.//a/@href') + link = [l for l in link if "status" in l ] + if len(link) >0: + base_url = "https://m.weibo.cn/" + url_new = base_url + link[0] + headers = { + "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0" + } + resp = self._request_get(url_new, headers= headers) + if resp is None: + content = content_short + else: + res = etree.HTML(resp.content) + scripts = res.xpath('//script') + content = scripts[2].xpath("text()") + pattern=re.compile('"text": "(.+),\n') + result = pattern.findall(content[0]) + content = etree.HTML(result[0]) + content = content.xpath("//text()") + content = ''.join(content) + else: + content = content_short + + pbar.update(1) + time.sleep(delay) + + return content_short, content + diff --git a/FinNLP/finnlp/data_sources/social_media/xueqiu_streaming.py b/FinNLP/finnlp/data_sources/social_media/xueqiu_streaming.py new file mode 100644 index 0000000000000000000000000000000000000000..3bbe04175995d29cdb27751837280f75a69f1ed6 --- /dev/null +++ 
b/FinNLP/finnlp/data_sources/social_media/xueqiu_streaming.py @@ -0,0 +1,67 @@ +import warnings +warnings.filterwarnings("ignore") +import requests +from lxml import etree +from tqdm import tqdm +import pandas as pd +import json +import time +from finnlp.data_sources.social_media._base import Social_Media_Downloader + +# TODO: +# 1. Contents + +class Xueqiu_Streaming(Social_Media_Downloader): + def __init__(self, args = {}): + super().__init__(args) + self.dataframe = pd.DataFrame() + + def download_streaming_stock(self, keyword = "茅台", rounds = 3, delay = 0.5): + # first get cookie + self._get_cookie(keyword = keyword) + + url = "https://xueqiu.com/query/v1/search/status.json" + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', + } + print("Downloading ...", end = ' ') + for page in range(rounds): + params = { + 'sortId': '2', + 'q': keyword, + 'count': '10', + 'page': page, + } + + res = self.session.get(url = url, headers= headers, params = params) + if res.status_code != 200: + break + + res = json.loads(res.text) + tmp = pd.DataFrame(res["list"]) + self.dataframe = pd.concat([self.dataframe, tmp]) + + print(page, end = ' ') + + time.sleep(delay) + + self.dataframe["created_at"] = pd.to_datetime(self.dataframe["created_at"], unit = 'ms') + + + def _get_cookie(self, keyword = "茅台"): + first_url = "https://xueqiu.com/k" + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', + } + params = { + 'q': keyword + } + + self.session = requests.session() + + res = self.session.get(headers = headers, url = first_url, params=params) + if res.status_code != 200: + print(f"Connection Error: {res.status_code}") + return f"Connection Error: {res.status_code}" + + diff --git a/FinNLP/finnlp/data_sources/trends/__init__.py b/FinNLP/finnlp/data_sources/trends/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/FinNLP/finnlp/data_sources/trends/_base.py b/FinNLP/finnlp/data_sources/trends/_base.py new file mode 100644 index 0000000000000000000000000000000000000000..0988ac6c84c3c19ea0380872f1178ea4b2e3e8d0 --- /dev/null +++ b/FinNLP/finnlp/data_sources/trends/_base.py @@ -0,0 +1,16 @@ +class Trend_Downloader: + + def __init__(self, args = {}): + pass + + def download(self, start_date, end_date, stock = "all"): + pass + + def clean_data(self): + pass + + def gather_one_day(self,date,stock = "all",delay = 0.1): + pass + + def transfer_standard_date_to_nonstandard(self,date): + pass \ No newline at end of file diff --git a/FinNLP/finnlp/data_sources/trends/baidu.py b/FinNLP/finnlp/data_sources/trends/baidu.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/FinNLP/finnlp/data_sources/trends/google.py b/FinNLP/finnlp/data_sources/trends/google.py new file mode 100644 index 0000000000000000000000000000000000000000..0ea8aa64882b63627063c513a449398601c64d6f --- /dev/null +++ b/FinNLP/finnlp/data_sources/trends/google.py @@ -0,0 +1,16 @@ +from pytrends.request import TrendReq +import pandas as pd + +class Google_Trends: + def __init__(self,args = {}): + # https://github.com/GeneralMills/pytrends + self.pytrends = TrendReq(hl='en-US', tz=360) + + def download(self, start_date, end_date, stock = 'apple' ): + self.date_list = pd.date_range(start_date,end_date) + timeframe = [f"{start_date} {end_date}"] + kw_list = [stock] + self.pytrends.build_payload(kw_list=kw_list, timeframe=timeframe) + res = self.pytrends.interest_over_time() + # res.columns = ["date","value"] + return res diff --git a/FinNLP/finnlp/large_language_models/__init__.py b/FinNLP/finnlp/large_language_models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
# https://www.app4gpt.com
# A replacement for openai's API in China

class App4gpt_Chat_Agent:
    """Chat agent that talks to the app4gpt endpoint through the ``openai`` client.

    The running conversation is kept in ``self.conversation_list`` as a list
    of ``{"role", "content"}`` dicts.
    """

    def __init__(self, args):
        """Configure the client and sampling parameters.

        Args:
            args: Dict that must contain ``"token"``. Optional keys:
                ``temperature``, ``top_p``, ``n``, ``max_tokens``,
                ``presence_penalty``, ``frequency_penalty`` and
                ``init_prompt`` (stored as the system message).
        """
        assert "token" in args.keys()
        # These are module-level settings of the openai client.
        openai.api_key = args["token"]
        openai.api_base = "https://api.app4gpt.com/v1"

        self.temperature = args.get("temperature", 1)
        self.top_p = args.get("top_p", 1)
        self.n = args.get("n", 1)
        self.max_tokens = args.get("max_tokens")
        self.presence_penalty = args.get("presence_penalty", 0)
        self.frequency_penalty = args.get("frequency_penalty", 0)

        self.conversation_list = []
        if "init_prompt" in args:
            self.conversation_list.append(
                {"role": "system", "content": args["init_prompt"]}
            )

    def get_single_response(self, prompt, model="gpt-3.5-turbo"):
        """Send ``prompt`` with the conversation so far and return the reply.

        The user message and the answer are recorded only after the request
        succeeds, so a failed call leaves the history untouched and can be
        retried without duplicating the prompt.

        Args:
            prompt: User message text.
            model: Chat model name passed to the API.

        Returns:
            The content of the first returned choice.
        """
        user_message = {"role": "user", "content": prompt}
        response = openai.ChatCompletion.create(
            model=model,
            messages=self.conversation_list + [user_message],
            temperature=self.temperature,
            top_p=self.top_p,
            n=self.n,
            max_tokens=self.max_tokens,
            presence_penalty=self.presence_penalty,
            frequency_penalty=self.frequency_penalty,
        )
        answer = response.choices[0].message['content']
        self.conversation_list.append(user_message)
        self.conversation_list.append({"role": "assistant", "content": answer})
        return answer

    def show_conversation(self):
        """Print every message, breaking lines after each period."""
        for msg in self.conversation_list:
            content = msg['content'].replace(".", ".\n")
            if msg['role'] == 'user':
                print(f"\U0001F47B: {content}\n")
            elif msg['role'] == 'system':
                print(f"\U0001F4BB: {content}\n")
            else:
                print(f"\U0001F916: {content}\n")

    def get_multiple_response(self, prompts):
        """Not implemented yet; returns None."""
        pass


class Openai_Chat_Agent:
    """Chat agent backed by the ``openai`` client.

    The running conversation is kept in ``self.conversation_list`` as a list
    of ``{"role", "content"}`` dicts.
    """

    def __init__(self, args):
        """Configure the client and sampling parameters.

        Args:
            args: Dict that must contain ``"token"``. Optional keys:
                ``temperature``, ``top_p``, ``n``, ``max_tokens``,
                ``presence_penalty``, ``frequency_penalty`` and
                ``init_prompt`` (stored as the system message).
        """
        assert "token" in args.keys()
        # This is a module-level setting of the openai client.
        openai.api_key = args["token"]

        self.temperature = args.get("temperature", 1)
        self.top_p = args.get("top_p", 1)
        self.n = args.get("n", 1)
        self.max_tokens = args.get("max_tokens")
        self.presence_penalty = args.get("presence_penalty", 0)
        self.frequency_penalty = args.get("frequency_penalty", 0)

        self.conversation_list = []
        if "init_prompt" in args:
            self.conversation_list.append(
                {"role": "system", "content": args["init_prompt"]}
            )

    def get_single_response(self, prompt, model="gpt-3.5-turbo"):
        """Send ``prompt`` with the conversation so far and return the reply.

        The user message and the answer are recorded only after the request
        succeeds, so a failed call leaves the history untouched and can be
        retried without duplicating the prompt.

        Args:
            prompt: User message text.
            model: Chat model name passed to the API (defaults to the value
                that used to be hard-coded).

        Returns:
            The content of the first returned choice.
        """
        user_message = {"role": "user", "content": prompt}
        response = openai.ChatCompletion.create(
            model=model,
            messages=self.conversation_list + [user_message],
            temperature=self.temperature,
            top_p=self.top_p,
            n=self.n,
            max_tokens=self.max_tokens,
            presence_penalty=self.presence_penalty,
            frequency_penalty=self.frequency_penalty,
        )
        answer = response.choices[0].message['content']
        self.conversation_list.append(user_message)
        self.conversation_list.append({"role": "assistant", "content": answer})
        return answer

    def show_conversation(self):
        """Print every message, breaking lines after each period."""
        for msg in self.conversation_list:
            content = msg['content'].replace(".", ".\n")
            if msg['role'] == 'user':
                print(f"\U0001F47B: {content}\n")
            elif msg['role'] == 'system':
                print(f"\U0001F4BB: {content}\n")
            else:
                print(f"\U0001F916: {content}\n")

    def get_multiple_response(self, prompts):
        """Not implemented yet; returns None."""
        pass
# Browser-like User-Agent shared by every request in this module.
_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}

# Extracts the bracketed, comma-separated address list of one proxy group.
_IP_GROUP_PATTERN = re.compile(r'\[(.+)\]')


def _check_ips(proxies_list, test_url):
    """Return the proxies for which ``test_url`` answers 200 within one second."""
    can_use = []
    for proxy in tqdm(proxies_list, desc = "Checking ips"):
        try:
            response = requests.get(test_url, headers=_HEADERS, proxies=proxy, timeout=1)  # raises on timeout
        except Exception:
            # Best-effort probe: any failure just means the proxy is unusable.
            continue
        if response.status_code == 200:
            can_use.append(proxy)
    return can_use


def check_china_ips(proxies_list):
    """Check proxies against baidu.com and return the usable ones.

    Args:
        proxies_list: List of ``requests``-style proxy dicts.

    Returns:
        The sub-list of proxies for which the probe returned HTTP 200.
    """
    return _check_ips(proxies_list, 'http://www.baidu.com')


def check_us_ips(proxies_list):
    """Check proxies against google.com and return the usable ones.

    Args:
        proxies_list: List of ``requests``-style proxy dicts.

    Returns:
        The sub-list of proxies for which the probe returned HTTP 200.
    """
    return _check_ips(proxies_list, 'http://www.google.com')


def get_china_free_proxy(pages = 10):
    """Scrape free proxies from kuaidaili.com and return the usable ones.

    Each page is requested repeatedly until its table contains rows.

    Args:
        pages: Number of listing pages to scrape (pages 1..pages).

    Returns:
        List of ``{http_type: "ip:port"}`` dicts that passed
        ``check_china_ips``.
    """
    proxies_list = []
    for page in tqdm(range(1, pages+1), desc = "Gathering free ips by pages..."):

        base_url = f'https://www.kuaidaili.com/free/inha/{page}'
        success = False
        while not success:
            try:
                response = requests.get(base_url, headers=_HEADERS)
                trs = etree.HTML(response.text).xpath('//table/tbody/tr')
                if trs:
                    success = True
                    for tr in trs:
                        http_type = tr.xpath('./td[4]/text()')[0]
                        ip_num = tr.xpath('./td[1]/text()')[0]
                        port_num = tr.xpath('./td[2]/text()')[0]
                        # NOTE(review): requests matches proxy keys against
                        # the lower-case URL scheme; confirm that the key
                        # scraped here (e.g. "HTTP") is actually honoured.
                        proxies_list.append({http_type: ip_num + ':' + port_num})
                else:
                    # Empty table: pause briefly before retrying. The
                    # original called the non-existent time.delay, whose
                    # AttributeError was swallowed, so the loop never paused.
                    time.sleep(0.01)
            except Exception:
                # Still best-effort, but no longer a bare except, so
                # Ctrl-C / SystemExit can break out of the retry loop.
                time.sleep(0.01)

    can_use = check_china_ips(proxies_list)

    print(f'获取到的代理ip数量: {len(proxies_list)} 。Get proxy ips: {len(proxies_list)}.')
    print(f'能用的代理数量: {len(can_use)}。Usable proxy ips: {len(can_use)}.' )

    return can_use


def get_us_free_proxy(pages = 10):
    """Scrape HTTP proxies from openproxy.space and return the usable ones.

    Args:
        pages: Caps the number of candidates at ``pages * 15``.

    Returns:
        List of ``{"HTTP": "ip:port"}`` dicts that passed ``check_us_ips``;
        an empty list when the listing page cannot be fetched.
    """
    url = "https://openproxy.space/list/http"
    response = requests.get(url, headers=_HEADERS)
    if response.status_code != 200:
        print("Connection Error. Please make sure that your computer now have the access to Google.com")
        # Nothing to parse on a failed response.
        return []
    res = etree.HTML(response.text)
    http_type = "HTTP"
    proxies_list = []

    # The address list is embedded in the page's fourth <script> element.
    scripts = res.xpath("//script")
    content = scripts[3].xpath(".//text()")
    pattern = re.compile('LIST",data:(.+),added:')
    result_list = pattern.findall(content[0])
    result_list = result_list[0].strip("[{").strip("}]").split("},{")

    for result in result_list:
        addresses = _IP_GROUP_PATTERN.findall(result)[0].split(",")
        for ip in addresses:
            proxies_list.append(
                {http_type: ip.strip("\"")}
            )
    total = pages* 15
    proxies_list = proxies_list[:total]
    can_use = check_us_ips(proxies_list)
    print(f'Get proxy ips: {len(proxies_list)}.')
    print(f'Usable proxy ips: {len(can_use)}.' )

    return can_use


class Kuaidaili:
    """Holds Kuaidaili tunnel credentials and builds a ``requests`` proxy dict."""

    def __init__(self, tunnel, username, password):
        """Store the tunnel address (``host:port``) and its credentials."""
        self.tunnel = tunnel
        self.username = username
        self.password = password

    def get_kuaidaili_tunnel_proxy(self):
        """Return ``{"http": ..., "https": ...}`` pointing at the tunnel."""
        proxy_url = f"http://{self.username}:{self.password}@{self.tunnel}/"
        return {
            "http": proxy_url,
            "https": proxy_url,
        }
def _read_requirements(path="requirements.txt"):
    """Return the requirement specifiers listed in ``path``.

    Inline ``#`` comments are stripped. Blank and comment-only lines are
    dropped so that no empty string reaches ``install_requires``. Returns an
    empty list (after printing a notice) when the file cannot be opened.
    """
    try:
        with open(path, "r") as f:
            stripped = (line.split('#', 1)[0].strip() for line in f)
            return [requirement for requirement in stripped if requirement]
    except OSError:
        print("'requirements.txt' not found!")
        return []


# Read requirements.txt, ignore comments
REQUIRES = _read_requirements()

setup(
    name="FinNLP",
    version="0.0.1",
    include_package_data=True,
    author="AI4Finance Foundation",
    author_email="contact@ai4finance.org",
    url="https://github.com/AI4Finance-Foundation/FinNLP",
    license="MIT",
    packages=find_packages(),
    install_requires=REQUIRES,
    description="FinNLP",
    long_description="""FinNLP""",
    classifiers=[
        # Trove classifiers
        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
    ],
    keywords="Financial Large Language Models",
    platforms=["any"],
    python_requires=">=3.6",
)
'train_dataset')\n", + " download_parquet_files(test_urls, 'test_dataset')\n", + "\n", + " train_dataset = pd.read_parquet('./train_dataset', engine='pyarrow')\n", + " test_dataset = pd.read_parquet('./test_dataset', engine='pyarrow')\n", + "\n", + " # train_dataset.rebalance()\n", + " # test_dataset.rebalance()\n", + " return train_dataset, test_dataset\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textdategenderagehoroscopejob
395587it has just hit me that in a matter of a few s...04,August,2004female27VirgoindUnk
275435Ok, Dear Dopugie/ Ben/ Matt/ anyone who can he...27,June,2004male14SagittariusStudent
634637ooyeah/Happy Roctober = Miriam? Anyway, just ...03,August,2004female24LibraArts
264675Election season is in the air! And I know that...19,February,2004male24GeminiindUnk
628907His face bore the the signs of the many battle...03,February,2004male24LibraindUnk
.....................
41392Everything You Wanted to Know About Oscillatin...04,October,2003female40GeminiLaw
71956Sick Chickens March has not been a good mont...08,August,2004male26CapricornCommunications-Media
21415Gunshot as you are more likely to die quickly ...26,March,2003male17TaurusTechnology
255686Well I am sad to report that my Uncle has pass...07,March,2004female27CapricornindUnk
401130urlLink A picture from the 'Precious Moment...10,June,2004male33GeminiTechnology
\n", + "

6898 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " text date \\\n", + "395587 it has just hit me that in a matter of a few s... 04,August,2004 \n", + "275435 Ok, Dear Dopugie/ Ben/ Matt/ anyone who can he... 27,June,2004 \n", + "634637 ooyeah/Happy Roctober = Miriam? Anyway, just ... 03,August,2004 \n", + "264675 Election season is in the air! And I know that... 19,February,2004 \n", + "628907 His face bore the the signs of the many battle... 03,February,2004 \n", + "... ... ... \n", + "41392 Everything You Wanted to Know About Oscillatin... 04,October,2003 \n", + "71956 Sick Chickens March has not been a good mont... 08,August,2004 \n", + "21415 Gunshot as you are more likely to die quickly ... 26,March,2003 \n", + "255686 Well I am sad to report that my Uncle has pass... 07,March,2004 \n", + "401130 urlLink A picture from the 'Precious Moment... 10,June,2004 \n", + "\n", + " gender age horoscope job \n", + "395587 female 27 Virgo indUnk \n", + "275435 male 14 Sagittarius Student \n", + "634637 female 24 Libra Arts \n", + "264675 male 24 Gemini indUnk \n", + "628907 male 24 Libra indUnk \n", + "... ... ... ... ... 
\n", + "41392 female 40 Gemini Law \n", + "71956 male 26 Capricorn Communications-Media \n", + "21415 male 17 Taurus Technology \n", + "255686 female 27 Capricorn indUnk \n", + "401130 male 33 Gemini Technology \n", + "\n", + "[6898 rows x 6 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "name = 'blog_authorship_corpus'\n", + "sample_ratio = 0.01\n", + "# train_dataset, test_dataset=web_data_prepare(name = name)\n", + "train_dataset = pd.read_parquet('./train_dataset', engine='pyarrow')\n", + "test_dataset = pd.read_parquet('./test_dataset', engine='pyarrow')\n", + "train_dataset=train_dataset.sample(frac=sample_ratio)\n", + "test_dataset=test_dataset.sample(frac=sample_ratio)\n", + "train_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Junk Data\n", + "Large-scale datasets often contain an uneven distribution of text representation, which includes a significant amount of nonsensical and boilerplate text - such as HTML tags.\n", + "\n", + "The presence of such \"noise\" or irrelevant content in the dataset is detrimental to the training of predictive models, specifically those that operate by predicting the next token based on all previous ones. Therefore, it's crucial to clean the dataset and remove these undesired elements prior to the training phase.\n", + "\n", + "This piece of Python code calculated a measure of \"impurity\" in text documents, and then computing the proportion of documents that exceed a certain impurity threshold. It defines a compiled regular expression that matches any of the following suspicious characters: &, #, <, >, {, }, [, ]." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.049724557842853" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re_e = r'[&#<>{}\\[\\]\\\\]'\n", + "threshold = 0.01\n", + "df, impurity_ratio = junk_eliminate(train_dataset, re_e, threshold)\n", + "impurity_ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Biased Content\n", + "It is crucial in the training of language models to be vigilant and potentially apply tools to exclude toxic content from the pre-training datasets. This practice helps to prevent the models from demonstrating bias or generating detrimental content in subsequent applications.\n", + "\n", + "One approach to address this issue is by scanning the text for offensive words. For instance, the creators of the C4 dataset have implemented such a filtering mechanism. The following code references the word list that they open-sourced.\n", + "\n", + "The following code utilizes the word list to quantify the \"biased content ratio\" in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.1284430269643375" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_toxic_df, biased_content_ratio = toxic_eliminate(df = train_dataset, l_kind = 'en')\n", + "biased_content_ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Too Short Document\n", + "The aim of language modeling is to master the generation of text based on preceding tokens. 
In this scenario, eliminating extremely brief documents (text consisting of fewer than approximately 100 tokens) from the corpus could aid in the reduction of noise, by producing contiguous text to model dependencies within the text.\n", + "\n", + "Use the Hugging Face Transformers library to tokenize text and then calculate the proportion of documents that are \"too short\" in a dataset. This example converts text into tokens that the BERT model can understand. Choose a tokenizer for your model." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.40895911858509715" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", + "df = train_dataset\n", + "not_short_df, too_short_doc_ratio = short_eliminate(df, tokenizer, min_len=100)\n", + "too_short_doc_ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Contamination\n", + "Typically, ensuring the segregation of training and testing data is rather straightforward in machine learning. However, things become complicated in the context of large language models where both the training and benchmarking datasets are collected from the internet.\n", + "\n", + "For instance, the performance evaluation of a large language model using benchmark data (like question-answer pairs) can be significantly affected if the benchmark data also features in the model's training set. The procedure of eliminating instances from the training datasets that intersect with the existing benchmarking datasets is called \"decontamination\".\n", + "\n", + "This Python code below is being used to quantify the contamination problem lying in the datasets, i.e., the proportion of documents in the test set that also appear in the training set using N-grams.\n", + "\n", + "The approach here is from GPT-3 paper. 
OpenAI defined a test document as contaminated if any N-gram overlap existed with any training document. (They used a range of N values between 8 and 13 depending on dataset.) When constructing the WebText dataset, OpenAI researchers decontaminated the data by eliminating all Wikipedia content from the training set. This was necessary as Wikipedia data was heavily used in their benchmark datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.021108179419525065" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "contamination_ratio = contamination_eliminate(train_dataset, test_dataset)\n", + "contamination_ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Duplication\n", + "When datasets are created by scraping raw text from the Internet, this will often result in the same sequences being repeated multiple times. This paper mentions a single 50 word sequence that is repeated in the C4 dataset 60,000 times.\n", + "\n", + "Deduplication helps prevent models from outputting verbatim training data when there are many duplicates, and makes models less vulnerable to privacy attacks. 
Deduplication can also improve model training efficiency and prevent benchmark contamination.\n", + "\n", + "Tools & Tutorials\n", + "The GPT-3 paper mentions they fuzzily deduplicated documents within each dataset using Spark’s MinHashLSH implementation with 10 hashes.\n", + "\n", + "deduplicate-text-datasets is an ExactSubstr deduplication implementation (written in Rust) along with the scripts to perform ExactSubstr deduplication and inspect the results (written in Python).\n", + "\n", + "datasketch gives you probabilistic data structures that can process and search very large amounts of data super fast, with little loss of accuracy.\n", + "\n", + "This article provides a MinHash walkthrough to demonstrate how to implement parallel deduplication.\n", + "\n", + "The following code uses the datasketch library and LSH (Locality Sensitive Hashing) to deduplicate the dataset. For each text in the DataFrame, it creates a query MinHash object and performs a query on the LSH index to find similar documents.\n", + "\n", + "It is worth mentioning that the de-duplication process usually requires a lot of computational resources (CPU and RAM) due to the size of web crawl datasets and it's therefore recommended to run such computations in distributed settings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unique_documents, duplication_ratio = duplication_eliminate(train_dataset.sample(frac=sample_ratio))\n", + "duplication_ratio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thanks to [HuggingFace-Datasets-Text-Quality-Analysis](https://huggingface.co/spaces/Dreamsome/HuggingFace-Datasets-Text-Quality-Analysis)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langchain", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/test/Data_Sources_Company_Announcement.ipynb b/FinNLP/test/Data_Sources_Company_Announcement.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..505f2c3b2048c317a0410fd7eda69eb8f42a6606 --- /dev/null +++ b/FinNLP/test/Data_Sources_Company_Announcement.ipynb @@ -0,0 +1,783 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../FinNLP\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SEC" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.company_announcement.sec import SEC_Announcement" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2020-01-01\"\n", + "end_date = \"2020-06-01\"\n", 
+ "stock = \"AAPL\"\n", + "config = {\n", + " \"use_proxy\": \"us_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 3,\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Checking ips: 100%|██████████| 45/45 [01:42<00:00, 2.28s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get proxy ips: 45.\n", + "Usable proxy ips: 44.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading by item...: 100%|██████████| 39/39 [01:39<00:00, 2.54s/it]\n" + ] + } + ], + "source": [ + "downloader = SEC_Announcement(config)\n", + "downloader.download_date_range_stock(start_date, end_date, stock = stock)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
_idciksperiod_endingroot_formfile_numdisplay_namesxslsequencefile_datebiz_statessicsformadshfilm_numbiz_locationsfile_typefile_descriptioninc_statesitecontent
00000320193-20-000056:wf-form4_158932261319105.xml[0001631982, 0000320193]2020-05-084[][KONDO CHRIS (CIK 0001631982), Apple Inc. (A...xslF345X0312020-05-12[][3571]40000320193-20-000056[][, ]4FORM 4[, CA, ][]SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
10000320193-20-000054:wf-form4_158829658358801.xml[0001051401, 0000320193]2020-04-284[001-36743][JUNG ANDREA (CIK 0001051401), Apple Inc. (A...xslF345X0312020-04-30[CA][3571]40000320193-20-000054[20838087][, Cupertino, CA]4FORM 4[, CA][]SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
\n", + "
" + ], + "text/plain": [ + " _id \\\n", + "0 0000320193-20-000056:wf-form4_158932261319105.xml \n", + "1 0000320193-20-000054:wf-form4_158829658358801.xml \n", + "\n", + " ciks period_ending root_form file_num \\\n", + "0 [0001631982, 0000320193] 2020-05-08 4 [] \n", + "1 [0001051401, 0000320193] 2020-04-28 4 [001-36743] \n", + "\n", + " display_names xsl sequence \\\n", + "0 [KONDO CHRIS (CIK 0001631982), Apple Inc. (A... xslF345X03 1 \n", + "1 [JUNG ANDREA (CIK 0001051401), Apple Inc. (A... xslF345X03 1 \n", + "\n", + " file_date biz_states sics form adsh film_num \\\n", + "0 2020-05-12 [] [3571] 4 0000320193-20-000056 [] \n", + "1 2020-04-30 [CA] [3571] 4 0000320193-20-000054 [20838087] \n", + "\n", + " biz_locations file_type file_description inc_states ite \\\n", + "0 [, ] 4 FORM 4 [, CA, ] [] \n", + "1 [, Cupertino, CA] 4 FORM 4 [, CA] [] \n", + "\n", + " content \n", + "0 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "1 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "# df = df.drop_duplicates()\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21, 20)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
file_datedisplay_namescontent
02020-05-12[KONDO CHRIS (CIK 0001631982), Apple Inc. (A...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
12020-04-30[JUNG ANDREA (CIK 0001051401), Apple Inc. (A...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
22020-04-17[O'BRIEN DEIRDRE (CIK 0001767094), Apple Inc....SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
32020-04-17[KONDO CHRIS (CIK 0001631982), Apple Inc. (A...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
42020-04-09[Maestri Luca (CIK 0001513362), Apple Inc. (...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
52020-04-03[WILLIAMS JEFFREY E (CIK 0001496686), Apple I...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
62020-04-03[Maestri Luca (CIK 0001513362), Apple Inc. (...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
72020-02-28[WAGNER SUSAN (CIK 0001059235), Apple Inc. (...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
82020-02-28[LEVINSON ARTHUR D (CIK 0001214128), Apple In...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
92020-02-28[JUNG ANDREA (CIK 0001051401), Apple Inc. (A...SEC Form 4 \\n FORM 4UNITED STATES SECURITIES...
\n", + "
" + ], + "text/plain": [ + " file_date display_names \\\n", + "0 2020-05-12 [KONDO CHRIS (CIK 0001631982), Apple Inc. (A... \n", + "1 2020-04-30 [JUNG ANDREA (CIK 0001051401), Apple Inc. (A... \n", + "2 2020-04-17 [O'BRIEN DEIRDRE (CIK 0001767094), Apple Inc.... \n", + "3 2020-04-17 [KONDO CHRIS (CIK 0001631982), Apple Inc. (A... \n", + "4 2020-04-09 [Maestri Luca (CIK 0001513362), Apple Inc. (... \n", + "5 2020-04-03 [WILLIAMS JEFFREY E (CIK 0001496686), Apple I... \n", + "6 2020-04-03 [Maestri Luca (CIK 0001513362), Apple Inc. (... \n", + "7 2020-02-28 [WAGNER SUSAN (CIK 0001059235), Apple Inc. (... \n", + "8 2020-02-28 [LEVINSON ARTHUR D (CIK 0001214128), Apple In... \n", + "9 2020-02-28 [JUNG ANDREA (CIK 0001051401), Apple Inc. (A... \n", + "\n", + " content \n", + "0 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "1 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "2 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "3 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "4 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "5 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "6 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "7 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "8 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... \n", + "9 SEC Form 4 \\n FORM 4UNITED STATES SECURITIES... 
" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"file_date\", \"display_names\", \"content\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Juchao" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.company_announcement.juchao import Juchao_Announcement" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2020-01-01\"\n", + "end_date = \"2020-06-01\"\n", + "stock = \"000001\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 3,\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 3/3 [00:05<00:00, 1.86s/it]\n", + "Checking ips: 100%|██████████| 45/45 [00:48<00:00, 1.09s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "获取到的代理ip数量: 45 。Get proxy ips: 45.\n", + "能用的代理数量: 6。Usable proxy ips: 6.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1bb13261e75147929b30222347ab9cc5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading by page...: 0%| | 0/2 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsecCodesecNameorgIdannouncementIdannouncementTitleannouncementTimeadjunctUrladjunctSizeadjunctType...importantbatchNumannouncementContentorgNametileSecNameshortTitleannouncementTypeNamesecNameListPDF_pathContent
0None000001平安银行gssz00000011207862647关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告2020-05-27finalpage/2020-05-27/1207862647.PDF148PDF...NoneNoneNone平安银行关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告NoneNoneremoved证券代码: 000001 证券简称:平安银行 ...
1None000001平安银行gssz000000112078436882019年年度权益分派实施公告2020-05-22finalpage/2020-05-22/1207843688.PDF214PDF...NoneNoneNone平安银行2019年年度权益分派实施公告NoneNoneremoved1 证券代码: 000001 证券简称:平安银行 ...
\n", + "

2 rows × 25 columns

\n", + "" + ], + "text/plain": [ + " id secCode secName orgId announcementId \\\n", + "0 None 000001 平安银行 gssz0000001 1207862647 \n", + "1 None 000001 平安银行 gssz0000001 1207843688 \n", + "\n", + " announcementTitle announcementTime \\\n", + "0 关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告 2020-05-27 \n", + "1 2019年年度权益分派实施公告 2020-05-22 \n", + "\n", + " adjunctUrl adjunctSize adjunctType ... \\\n", + "0 finalpage/2020-05-27/1207862647.PDF 148 PDF ... \n", + "1 finalpage/2020-05-22/1207843688.PDF 214 PDF ... \n", + "\n", + " important batchNum announcementContent orgName tileSecName \\\n", + "0 None None None 平安银行 \n", + "1 None None None 平安银行 \n", + "\n", + " shortTitle announcementTypeName secNameList PDF_path \\\n", + "0 关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告 None None removed \n", + "1 2019年年度权益分派实施公告 None None removed \n", + "\n", + " Content \n", + "0 证券代码: 000001 证券简称:平安银行 ... \n", + "1 1 证券代码: 000001 证券简称:平安银行 ... \n", + "\n", + "[2 rows x 25 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(42, 25)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
announcementTimeshortTitleContent
02020-05-27关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告证券代码: 000001 证券简称:平安银行 ...
12020-05-222019年年度权益分派实施公告1 证券代码: 000001 证券简称:平安银行 ...
22020-05-20关于获准发行小微企业贷款专项金融债券的公告证券代码: 000001 证券简称:平安银行 ...
32020-05-16监事会决议公告1 证券代码: 000001 证券简称: 平安银行 ...
42020-05-152019年年度股东大会决议公告1 证券代码: 000001 证券简称:平安银行 ...
52020-05-152019年年度股东大会的法律意见书北京总部 电话 : (86 -10) 8519 -1300 传真 : (86 -10...
62020-04-30中信证券股份有限公司、平安证券股份有限公司关于公司关联交易有关事项的核查意见1 中信证券股份有限公司 、平安证券股份有限 公司 关于平安银行股份有限公司 关联交易 有...
72020-04-30独立董事独立意见1 平安银行股份有限公司独立董事独立意见 根据《关于在上市公司建立独立董事制度的指导...
82020-04-30关联交易公告1 证券代码: 000001 证券简称:平安银行 ...
92020-04-212020年第一季度报告全文证券代码: 000001 证券简称:平安银行 ...
\n", + "
" + ], + "text/plain": [ + " announcementTime shortTitle \\\n", + "0 2020-05-27 关于2020年第一期小型微型企业贷款专项金融债券发行完毕的公告 \n", + "1 2020-05-22 2019年年度权益分派实施公告 \n", + "2 2020-05-20 关于获准发行小微企业贷款专项金融债券的公告 \n", + "3 2020-05-16 监事会决议公告 \n", + "4 2020-05-15 2019年年度股东大会决议公告 \n", + "5 2020-05-15 2019年年度股东大会的法律意见书 \n", + "6 2020-04-30 中信证券股份有限公司、平安证券股份有限公司关于公司关联交易有关事项的核查意见 \n", + "7 2020-04-30 独立董事独立意见 \n", + "8 2020-04-30 关联交易公告 \n", + "9 2020-04-21 2020年第一季度报告全文 \n", + "\n", + " Content \n", + "0 证券代码: 000001 证券简称:平安银行 ... \n", + "1 1 证券代码: 000001 证券简称:平安银行 ... \n", + "2 证券代码: 000001 证券简称:平安银行 ... \n", + "3 1 证券代码: 000001 证券简称: 平安银行 ... \n", + "4 1 证券代码: 000001 证券简称:平安银行 ... \n", + "5 北京总部 电话 : (86 -10) 8519 -1300 传真 : (86 -10... \n", + "6 1 中信证券股份有限公司 、平安证券股份有限 公司 关于平安银行股份有限公司 关联交易 有... \n", + "7 1 平安银行股份有限公司独立董事独立意见 根据《关于在上市公司建立独立董事制度的指导... \n", + "8 1 证券代码: 000001 证券简称:平安银行 ... \n", + "9 证券代码: 000001 证券简称:平安银行 ... " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"announcementTime\", \"shortTitle\",\"Content\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finrl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "afd6dc03c9be451573fc2885de79a969af6a24a159f11a3ead741ab7a9ff405f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/test/Data_Sources_Finnhub_Sentiment.ipynb b/FinNLP/test/Data_Sources_Finnhub_Sentiment.ipynb new file mode 100644 index 
0000000000000000000000000000000000000000..fd597e66d6c9703e84e32cbfa73fd5144d493214 --- /dev/null +++ b/FinNLP/test/Data_Sources_Finnhub_Sentiment.ipynb @@ -0,0 +1,389 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"..\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.finnhub_sentiment import Finnhub_Sentiment" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Config" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "stock = \"AAPL\"\n", + "start_date = \"2023-01-16\"\n", + "end_date = \"2023-02-19\"\n", + "token = \"YOUR_FINNHUB_TOKEN\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### News_downloader" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "downloader = Finnhub_Sentiment({\"token\":token})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "75c33d2d1c984dbd959c21fdbaa437f4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/9 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
atTimementionpositiveScorenegativeScorepositiveMentionnegativeMentionscore
552023-01-16 00:00:0030.832384-0.99216611-0.087573
542023-01-16 01:00:0010.000000-0.73855801-1.000000
532023-01-16 03:00:0010.8715290.000000101.000000
522023-01-16 06:00:0010.000000-0.99566301-1.000000
512023-01-16 08:00:0010.000000-0.99991701-1.000000
\n", + "" + ], + "text/plain": [ + " atTime mention positiveScore negativeScore \\\n", + "55 2023-01-16 00:00:00 3 0.832384 -0.992166 \n", + "54 2023-01-16 01:00:00 1 0.000000 -0.738558 \n", + "53 2023-01-16 03:00:00 1 0.871529 0.000000 \n", + "52 2023-01-16 06:00:00 1 0.000000 -0.995663 \n", + "51 2023-01-16 08:00:00 1 0.000000 -0.999917 \n", + "\n", + " positiveMention negativeMention score \n", + "55 1 1 -0.087573 \n", + "54 0 1 -1.000000 \n", + "53 1 0 1.000000 \n", + "52 0 1 -1.000000 \n", + "51 0 1 -1.000000 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.reddit.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(571, 7)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.twitter.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
atTimementionpositiveScorenegativeScorepositiveMentionnegativeMentionscore
632023-01-16 00:00:001850.944244-0.934446104560.304738
622023-01-16 01:00:001750.957532-0.93471182740.063303
612023-01-16 02:00:001490.953763-0.9437356771-0.023705
602023-01-16 03:00:001760.952826-0.94147898580.261998
592023-01-16 04:00:002340.917571-0.933355119940.108952
\n", + "
" + ], + "text/plain": [ + " atTime mention positiveScore negativeScore \\\n", + "63 2023-01-16 00:00:00 185 0.944244 -0.934446 \n", + "62 2023-01-16 01:00:00 175 0.957532 -0.934711 \n", + "61 2023-01-16 02:00:00 149 0.953763 -0.943735 \n", + "60 2023-01-16 03:00:00 176 0.952826 -0.941478 \n", + "59 2023-01-16 04:00:00 234 0.917571 -0.933355 \n", + "\n", + " positiveMention negativeMention score \n", + "63 104 56 0.304738 \n", + "62 82 74 0.063303 \n", + "61 67 71 -0.023705 \n", + "60 98 58 0.261998 \n", + "59 119 94 0.108952 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.twitter.head(5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finrl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "afd6dc03c9be451573fc2885de79a969af6a24a159f11a3ead741ab7a9ff405f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/test/Data_Sources_News.ipynb b/FinNLP/test/Data_Sources_News.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..98ce774188e76fc27f4ed7fb8dc05f93ee6e4f1b --- /dev/null +++ b/FinNLP/test/Data_Sources_News.ipynb @@ -0,0 +1,3949 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../FinNLP\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CNBS" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.cnbc_streaming import CNBC_Streaming" + ] + }, + { + "cell_type": 
"code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = CNBC_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(30, 30)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['description', 'cn:lastPubDate', 'dateModified', 'cn:dateline',\n", + " 'cn:branding', 'section', 'cn:type', 'author', 'cn:source',\n", + " 'cn:subtype', 'duration', 'summary', 'expires', 'cn:sectionSubType',\n", + " 'cn:contentClassification', 'pubdateunix', '_id', 'url', '@id',\n", + " 'datePublished', 'cn:promoImage', 'cn:title', 'cn:keyword',\n", + " 'cn:liveURL', '_pubDate', '_type', '_index', 'brand', 'hint',\n", + " 'hint_detail'],\n", + " dtype='object')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
descriptioncn:lastPubDatedateModifiedcn:datelinecn:brandingsectioncn:typeauthorcn:sourcecn:subtype...cn:promoImagecn:titlecn:keywordcn:liveURL_pubDate_type_indexbrandhinthint_detail
0While Leah Ellis was earning her doctorate at ...2023-06-24T10:00:01+00002023-06-24T10:00:01+0000cnbcClean TechcnbcnewsstoryCatherine Clifford[]...https://image.cnbcfm.com/api/v1/image/10726095...Meet the 33-year-old Canadian chemist and the ...https://www.cnbc.com/2023/06/24/sublime-system...6/24/2023 10:00:01 PM00cnbcNaNNaN
1Amazon.com said on Friday it will take its inv...2023-06-24T04:50:41+00002023-06-24T04:50:41+0000cnbcTechnologywirestory[]...https://image.cnbcfm.com/api/v1/image/10726178...Amazon raises investment in India to $26 billi...https://www.cnbc.com/2023/06/24/amazon-commits...6/24/2023 1:49:10 PM01cnbcNaNNaN
\n", + "

2 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " description \\\n", + "0 While Leah Ellis was earning her doctorate at ... \n", + "1 Amazon.com said on Friday it will take its inv... \n", + "\n", + " cn:lastPubDate dateModified cn:dateline cn:branding \\\n", + "0 2023-06-24T10:00:01+0000 2023-06-24T10:00:01+0000 cnbc \n", + "1 2023-06-24T04:50:41+0000 2023-06-24T04:50:41+0000 cnbc \n", + "\n", + " section cn:type author cn:source cn:subtype ... \\\n", + "0 Clean Tech cnbcnewsstory Catherine Clifford [] ... \n", + "1 Technology wirestory [] ... \n", + "\n", + " cn:promoImage \\\n", + "0 https://image.cnbcfm.com/api/v1/image/10726095... \n", + "1 https://image.cnbcfm.com/api/v1/image/10726178... \n", + "\n", + " cn:title cn:keyword \\\n", + "0 Meet the 33-year-old Canadian chemist and the ... \n", + "1 Amazon raises investment in India to $26 billi... \n", + "\n", + " cn:liveURL _pubDate \\\n", + "0 https://www.cnbc.com/2023/06/24/sublime-system... 6/24/2023 10:00:01 PM \n", + "1 https://www.cnbc.com/2023/06/24/amazon-commits... 6/24/2023 1:49:10 PM \n", + "\n", + " _type _index brand hint hint_detail \n", + "0 0 0 cnbc NaN NaN \n", + "1 0 1 cnbc NaN NaN \n", + "\n", + "[2 rows x 30 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datePublishedcn:lastPubDatedateModifieddescriptionsectionauthorsummarycn:titlecn:type
02023-06-24T14:00:01+00002023-06-24T10:00:01+00002023-06-24T10:00:01+0000While Leah Ellis was earning her doctorate at ...Clean TechCatherine CliffordSublime Systems is scaling up a green cement. ...Meet the 33-year-old Canadian chemist and the ...cnbcnewsstory
12023-06-24T05:49:10+00002023-06-24T04:50:41+00002023-06-24T04:50:41+0000Amazon.com said on Friday it will take its inv...TechnologyModi and Jassy spoke about supporting Indian s...Amazon raises investment in India to $26 billi...wirestory
22023-06-23T22:12:07+00002023-06-23T18:29:45+00002023-06-23T18:29:45+0000As Microsoft attempts to convince regulators t...TechnologyJordan NovetMicrosoft has been eager to grow in mobile gam...Microsoft says it looked at acquiring Zynga bu...cnbcnewsstory
32023-06-23T21:51:15+00002023-06-23T17:51:15+00002023-06-23T17:51:15+0000The CEOs of Apple, Alphabet, Microsoft got a h...TechnologySeema ModyTop tech execs met with Indian Prime Minister ...Apple's Tim Cook calls India 'huge opportunity...cnbcnewsstory
42023-06-23T17:32:48+00002023-06-23T13:36:59+00002023-06-23T13:36:59+0000Tech executives like Apple CEO Tim Cook visit ...Fast Money Halftime ReportSeema ModyTech executives like Apple CEO Tim Cook visit ...Tech CEOs meet President Biden and Indian PM M...cnbcvideo
52023-06-23T12:30:17+00002023-06-23T13:32:42+00002023-06-23T13:32:42+0000Anyone want to buy or sell this tech rally? To...Pro: Pro ColumnistsBob PisaniFollowing the rebalancing of S&P indexes last ...Friday could offer a once-in-a-year chance to ...cnbcnewsstory
62023-06-23T12:26:42+00002023-06-23T11:47:06+00002023-06-23T11:47:06+0000Here are Friday's biggest calls on Wall Street...Pro: Analyst Stock PicksMichael BloomHere are Friday's biggest calls on Wall Street.Here are Friday's biggest analyst calls: Meta,...cnbcnewsstory
72023-06-23T06:30:01+00002023-06-23T02:30:01+00002023-06-23T02:30:01+0000This report is from today's CNBC Daily Open, o...Daily OpenYeo Boon PingInvestors have been lulled by a sense of secur...CNBC Daily Open: Seeking shelter in techcnbcnewsstory
82023-06-23T05:45:33+00002023-06-23T10:37:42+00002023-06-23T10:37:42+0000AMSTERDAM — Artificial intelligence has a raci...TechnologyRyan BrowneWhen it comes to banking and financial service...A.I. has a discrimination problem. In banking,...cnbcnewsstory
92023-06-22T23:43:01+00002023-06-23T01:01:10+00002023-06-23T01:01:10+0000This report is from today's CNBC Daily Open, o...Daily OpenYeo Boon PingInvestors have been lulled by a sense of secur...CNBC Daily Open: Rate hikes and red lightscnbcnewsstory
\n", + "
" + ], + "text/plain": [ + " datePublished cn:lastPubDate \\\n", + "0 2023-06-24T14:00:01+0000 2023-06-24T10:00:01+0000 \n", + "1 2023-06-24T05:49:10+0000 2023-06-24T04:50:41+0000 \n", + "2 2023-06-23T22:12:07+0000 2023-06-23T18:29:45+0000 \n", + "3 2023-06-23T21:51:15+0000 2023-06-23T17:51:15+0000 \n", + "4 2023-06-23T17:32:48+0000 2023-06-23T13:36:59+0000 \n", + "5 2023-06-23T12:30:17+0000 2023-06-23T13:32:42+0000 \n", + "6 2023-06-23T12:26:42+0000 2023-06-23T11:47:06+0000 \n", + "7 2023-06-23T06:30:01+0000 2023-06-23T02:30:01+0000 \n", + "8 2023-06-23T05:45:33+0000 2023-06-23T10:37:42+0000 \n", + "9 2023-06-22T23:43:01+0000 2023-06-23T01:01:10+0000 \n", + "\n", + " dateModified \\\n", + "0 2023-06-24T10:00:01+0000 \n", + "1 2023-06-24T04:50:41+0000 \n", + "2 2023-06-23T18:29:45+0000 \n", + "3 2023-06-23T17:51:15+0000 \n", + "4 2023-06-23T13:36:59+0000 \n", + "5 2023-06-23T13:32:42+0000 \n", + "6 2023-06-23T11:47:06+0000 \n", + "7 2023-06-23T02:30:01+0000 \n", + "8 2023-06-23T10:37:42+0000 \n", + "9 2023-06-23T01:01:10+0000 \n", + "\n", + " description \\\n", + "0 While Leah Ellis was earning her doctorate at ... \n", + "1 Amazon.com said on Friday it will take its inv... \n", + "2 As Microsoft attempts to convince regulators t... \n", + "3 The CEOs of Apple, Alphabet, Microsoft got a h... \n", + "4 Tech executives like Apple CEO Tim Cook visit ... \n", + "5 Anyone want to buy or sell this tech rally? To... \n", + "6 Here are Friday's biggest calls on Wall Street... \n", + "7 This report is from today's CNBC Daily Open, o... \n", + "8 AMSTERDAM — Artificial intelligence has a raci... \n", + "9 This report is from today's CNBC Daily Open, o... 
\n", + "\n", + " section author \\\n", + "0 Clean Tech Catherine Clifford \n", + "1 Technology \n", + "2 Technology Jordan Novet \n", + "3 Technology Seema Mody \n", + "4 Fast Money Halftime Report Seema Mody \n", + "5 Pro: Pro Columnists Bob Pisani \n", + "6 Pro: Analyst Stock Picks Michael Bloom \n", + "7 Daily Open Yeo Boon Ping \n", + "8 Technology Ryan Browne \n", + "9 Daily Open Yeo Boon Ping \n", + "\n", + " summary \\\n", + "0 Sublime Systems is scaling up a green cement. ... \n", + "1 Modi and Jassy spoke about supporting Indian s... \n", + "2 Microsoft has been eager to grow in mobile gam... \n", + "3 Top tech execs met with Indian Prime Minister ... \n", + "4 Tech executives like Apple CEO Tim Cook visit ... \n", + "5 Following the rebalancing of S&P indexes last ... \n", + "6 Here are Friday's biggest calls on Wall Street. \n", + "7 Investors have been lulled by a sense of secur... \n", + "8 When it comes to banking and financial service... \n", + "9 Investors have been lulled by a sense of secur... \n", + "\n", + " cn:title cn:type \n", + "0 Meet the 33-year-old Canadian chemist and the ... cnbcnewsstory \n", + "1 Amazon raises investment in India to $26 billi... wirestory \n", + "2 Microsoft says it looked at acquiring Zynga bu... cnbcnewsstory \n", + "3 Apple's Tim Cook calls India 'huge opportunity... cnbcnewsstory \n", + "4 Tech CEOs meet President Biden and Indian PM M... cnbcvideo \n", + "5 Friday could offer a once-in-a-year chance to ... cnbcnewsstory \n", + "6 Here are Friday's biggest analyst calls: Meta,... cnbcnewsstory \n", + "7 CNBC Daily Open: Seeking shelter in tech cnbcnewsstory \n", + "8 A.I. has a discrimination problem. In banking,... 
cnbcnewsstory \n", + "9 CNBC Daily Open: Rate hikes and red lights cnbcnewsstory " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"datePublished\", \"cn:lastPubDate\", \"dateModified\", \"description\", \"section\" ,\"author\", \"summary\" , \"cn:title\", \"cn:type\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Yicai / 第一财经" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.yicai_streaming import Yicai_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = Yicai_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"茅台\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60, 13)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorchannelidcreationDatedescidpreviewImagesourcetagstitletopicstypeourlweight
010000032006-21 11:41101788593第一财经北斗星通;游资;龙虎;买入;通信机构抄底超讯通信 游资封板北斗星通丨龙虎榜10/news/101788593.html50
1[周艾琳]5306-20 21:552003年7月,第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票,受到了国内...1017881832023/06/e42c4bda8cc367f523764c90447ab5a3.jpg第一财经外资;A股;基金;QFII;RQFII;瑞银QFII投资A股走过20年,外资驶向何方?10/news/101788183.html50
\n", + "
" + ], + "text/plain": [ + " author channelid creationDate \\\n", + "0 100000320 06-21 11:41 \n", + "1 [周艾琳] 53 06-20 21:55 \n", + "\n", + " desc id \\\n", + "0 101788593 \n", + "1 2003年7月,第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票,受到了国内... 101788183 \n", + "\n", + " previewImage source \\\n", + "0 第一财经 \n", + "1 2023/06/e42c4bda8cc367f523764c90447ab5a3.jpg 第一财经 \n", + "\n", + " tags title topics typeo \\\n", + "0 北斗星通;游资;龙虎;买入;通信 机构抄底超讯通信 游资封板北斗星通丨龙虎榜 10 \n", + "1 外资;A股;基金;QFII;RQFII;瑞银 QFII投资A股走过20年,外资驶向何方? 10 \n", + "\n", + " url weight \n", + "0 /news/101788593.html 50 \n", + "1 /news/101788183.html 50 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
authorcreationDatedescsourcetitle
006-21 11:41第一财经机构抄底超讯通信 游资封板北斗星通丨龙虎榜
1[周艾琳]06-20 21:552003年7月,第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票,受到了国内...第一财经QFII投资A股走过20年,外资驶向何方?
206-20 11:46第一财经北向资金抄底贵州<i>茅台</i> 游资联手封板中远海科丨龙虎榜
306-20 11:45第一财经22股获北向资金加仓超亿元
406-20 11:36第一财经北向资金抄底贵州<i>茅台</i> 游资联手封板中远海科丨龙虎榜
506-20 06:23第一财经每日早间精选热点新闻,点击「听新闻」,一键收听。第一财经布林肯结束访华,外交部美大司司长介绍情况;2023高考网上咨询周时间安排公布丨早报
6[第一财经]06-19 19:06今日股市0619丨50大跌小票指数强势 分化局面会否延续?
706-19 19:05第一财经今日股市0619丨50大跌小票指数强势 分化局面会否延续?
8[一财资讯]06-19 17:46净买入额居前三的是贵州<i>茅台</i>、药明康德、新易盛,分别获净买入3.48亿元、3.3...第一财经北向资金净卖出14.47亿元,贵州<i>茅台</i>、药明康德等获加仓
906-19 15:39第一财经三大指数小幅收跌 TMT赛道持续大涨|尾市盘点
\n", + "
" + ], + "text/plain": [ + " author creationDate desc \\\n", + "0 06-21 11:41 \n", + "1 [周艾琳] 06-20 21:55 2003年7月,第一家QFII瑞银宣布买入宝钢股份 、上港集箱等4只蓝筹公司股票,受到了国内... \n", + "2 06-20 11:46 \n", + "3 06-20 11:45 \n", + "4 06-20 11:36 \n", + "5 06-20 06:23 第一财经每日早间精选热点新闻,点击「听新闻」,一键收听。 \n", + "6 [第一财经] 06-19 19:06 \n", + "7 06-19 19:05 \n", + "8 [一财资讯] 06-19 17:46 净买入额居前三的是贵州茅台、药明康德、新易盛,分别获净买入3.48亿元、3.3... \n", + "9 06-19 15:39 \n", + "\n", + " source title \n", + "0 第一财经 机构抄底超讯通信 游资封板北斗星通丨龙虎榜 \n", + "1 第一财经 QFII投资A股走过20年,外资驶向何方? \n", + "2 第一财经 北向资金抄底贵州茅台 游资联手封板中远海科丨龙虎榜 \n", + "3 第一财经 22股获北向资金加仓超亿元 \n", + "4 第一财经 北向资金抄底贵州茅台 游资联手封板中远海科丨龙虎榜 \n", + "5 第一财经 布林肯结束访华,外交部美大司司长介绍情况;2023高考网上咨询周时间安排公布丨早报 \n", + "6 今日股市0619丨50大跌小票指数强势 分化局面会否延续? \n", + "7 第一财经 今日股市0619丨50大跌小票指数强势 分化局面会否延续? \n", + "8 第一财经 北向资金净卖出14.47亿元,贵州茅台、药明康德等获加仓 \n", + "9 第一财经 三大指数小幅收跌 TMT赛道持续大涨|尾市盘点 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"author\", \"creationDate\", \"desc\" ,\"source\", \"title\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Investor Place" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.investorplace_streaming import InvestorPlace_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = InvestorPlace_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletimeauthorsummary
0[Trillion-Dollar Tech: 3 Stocks Poised for Unp...Jun 19, 2023Faisal Humayun, InvestorPlace ContributorThese are the tech stocks to buy for multibagg...
1[Trillion-Dollar Tech: 3 Stocks Poised for Unp...Jun 22, 2023Chris MacDonald, InvestorPlace ContributorWarren Buffett is undoubtedly one of the great...
2[Invest Like a Billionaire: 3 Long-Term Stocks...Jun 18, 2023Joel Baglole, InvestorPlace ContributorWith markets now recovering from the downturn ...
3[Invest Like a Billionaire: 3 Long-Term Stocks...Jun 16, 2023Louis Navellier and the InvestorPlace Research...The best tech stocks to watch are involved in ...
4[3 Tech Titans Leading the Charge Toward $10 T...Jun 19, 2023Will Ashworth, InvestorPlace ContributorAvoiding bad stocks requires investors to get ...
5[3 Tech Titans Leading the Charge Toward $10 T...Jun 19, 2023Tyrik Torres, InvestorPlace ContributorWhile AI software companies tend to get more b...
6[7 Tech Stocks to Watch Out For in 2023 … and ...Jun 16, 2023Chris MacDonald, InvestorPlace ContributorMany long-term conservative investors pay atte...
7[7 Tech Stocks to Watch Out For in 2023 … and ...Jun 16, 2023Louis Navellier and the InvestorPlace Research...Every stock has its ups and downs, but reliabl...
8[3 Smart Takes on 3 Dumb Stocks]Jun 23, 2023Samuel O'Brient, InvestorPlace Financial News ...Even as tech stocks rally, short sellers are s...
9[3 Smart Takes on 3 Dumb Stocks]Jun 18, 2023Chris Markoch, InvestorPlace ContributorHere are seven high cash flow stocks that prov...
\n", + "
" + ], + "text/plain": [ + " title time \\\n", + "0 [Trillion-Dollar Tech: 3 Stocks Poised for Unp... Jun 19, 2023 \n", + "1 [Trillion-Dollar Tech: 3 Stocks Poised for Unp... Jun 22, 2023 \n", + "2 [Invest Like a Billionaire: 3 Long-Term Stocks... Jun 18, 2023 \n", + "3 [Invest Like a Billionaire: 3 Long-Term Stocks... Jun 16, 2023 \n", + "4 [3 Tech Titans Leading the Charge Toward $10 T... Jun 19, 2023 \n", + "5 [3 Tech Titans Leading the Charge Toward $10 T... Jun 19, 2023 \n", + "6 [7 Tech Stocks to Watch Out For in 2023 … and ... Jun 16, 2023 \n", + "7 [7 Tech Stocks to Watch Out For in 2023 … and ... Jun 16, 2023 \n", + "8 [3 Smart Takes on 3 Dumb Stocks] Jun 23, 2023 \n", + "9 [3 Smart Takes on 3 Dumb Stocks] Jun 18, 2023 \n", + "\n", + " author \\\n", + "0 Faisal Humayun, InvestorPlace Contributor \n", + "1 Chris MacDonald, InvestorPlace Contributor \n", + "2 Joel Baglole, InvestorPlace Contributor \n", + "3 Louis Navellier and the InvestorPlace Research... \n", + "4 Will Ashworth, InvestorPlace Contributor \n", + "5 Tyrik Torres, InvestorPlace Contributor \n", + "6 Chris MacDonald, InvestorPlace Contributor \n", + "7 Louis Navellier and the InvestorPlace Research... \n", + "8 Samuel O'Brient, InvestorPlace Financial News ... \n", + "9 Chris Markoch, InvestorPlace Contributor \n", + "\n", + " summary \n", + "0 These are the tech stocks to buy for multibagg... \n", + "1 Warren Buffett is undoubtedly one of the great... \n", + "2 With markets now recovering from the downturn ... \n", + "3 The best tech stocks to watch are involved in ... \n", + "4 Avoiding bad stocks requires investors to get ... \n", + "5 While AI software companies tend to get more b... \n", + "6 Many long-term conservative investors pay atte... \n", + "7 Every stock has its ups and downs, but reliabl... \n", + "8 Even as tech stocks rally, short sellers are s... \n", + "9 Here are seven high cash flow stocks that prov... 
" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"time\" ,\"author\", \"summary\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Guru Focus" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.gurufocus_streaming import GuruFocus_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support first page now!\n" + ] + } + ], + "source": [ + "news_downloader = GuruFocus_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"AAPL\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleviewsourcedatetime
03 Magic Formula Stocks Popular With Gurus0 ViewsMargaret Moran2023-06-23 17:38
1Jeremy Grantham: The Super Bubble Is About to Pop60 ViewsBen Alaimo2023-06-23 09:21
25 High GF Score Stocks That Outperformed the M...106 ViewsJames Li2023-06-21 19:36
3New Feature: DuPont Analysis Chart for Enhance...259 ViewsVera Yuan2023-06-21 16:55
4The Most-Sold Guru Stocks of the 1st Quarter261 ViewsMargaret Moran2023-06-16 17:32
5AI Revolution and Debt Ceiling Resolution198 ViewsWade W. Slome, CFA, CFP2023-06-05 21:03
6Nvidia vs. ARK Invest: Which Is the Better Gro...332 ViewsJoey Frenette2023-05-27 02:05
7Top 5 1st Quarter Trades of CYPRESS ASSET MANA...0 ViewsGuruFocus Editor2023-05-26 14:08
8Mill Creek Capital Advisors, LLC Buys 2, Sells...0 ViewsGuruFocus Editor2023-05-25 18:10
9Jim Simons' Renaissance Technologies Chops Pos...380 ViewsJames Li2023-05-24 18:43
\n", + "
" + ], + "text/plain": [ + " title view \\\n", + "0 3 Magic Formula Stocks Popular With Gurus 0 Views \n", + "1 Jeremy Grantham: The Super Bubble Is About to Pop 60 Views \n", + "2 5 High GF Score Stocks That Outperformed the M... 106 Views \n", + "3 New Feature: DuPont Analysis Chart for Enhance... 259 Views \n", + "4 The Most-Sold Guru Stocks of the 1st Quarter 261 Views \n", + "5 AI Revolution and Debt Ceiling Resolution 198 Views \n", + "6 Nvidia vs. ARK Invest: Which Is the Better Gro... 332 Views \n", + "7 Top 5 1st Quarter Trades of CYPRESS ASSET MANA... 0 Views \n", + "8 Mill Creek Capital Advisors, LLC Buys 2, Sells... 0 Views \n", + "9 Jim Simons' Renaissance Technologies Chops Pos... 380 Views \n", + "\n", + " source datetime \n", + "0 Margaret Moran 2023-06-23 17:38 \n", + "1 Ben Alaimo 2023-06-23 09:21 \n", + "2 James Li 2023-06-21 19:36 \n", + "3 Vera Yuan 2023-06-21 16:55 \n", + "4 Margaret Moran 2023-06-16 17:32 \n", + "5 Wade W. Slome, CFA, CFP 2023-06-05 21:03 \n", + "6 Joey Frenette 2023-05-27 02:05 \n", + "7 GuruFocus Editor 2023-05-26 14:08 \n", + "8 GuruFocus Editor 2023-05-25 18:10 \n", + "9 James Li 2023-05-24 18:43 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"view\" ,\"source\", \"datetime\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Alliance News" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.alliancenews_streaming import AllianceNews_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "news_downloader = AllianceNews_Streaming()\n", + "news_downloader.download_streaming_search(rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(36, 16)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
urlIdtitlesummarycreatedupdatedthumbnailUrlsourcetaxonomiestypeauthormetasponsorparentcontentIddisplayTaxonomiesparentTaxonomy
0/news/new-york-market-close-stocks-down-dollar...NEW YORK MARKET CLOSE: Stocks down, dollar up ...None2023-06-23T21:18:342023-06-23T21:18:34None{'code': 'ALLIANCE', 'title': 'Alliance News',...[{'termId': 'CTMRR', 'parentTermId': None, 'ti...newsNone{'title': 'NEW YORK MARKET CLOSE: Stocks down,...NoneNoneal1687551514259519100[{'termId': 'TPCOM', 'parentTermId': 'PTMKT', ...None
\n", + "
" + ], + "text/plain": [ + " urlId \\\n", + "0 /news/new-york-market-close-stocks-down-dollar... \n", + "\n", + " title summary \\\n", + "0 NEW YORK MARKET CLOSE: Stocks down, dollar up ... None \n", + "\n", + " created updated thumbnailUrl \\\n", + "0 2023-06-23T21:18:34 2023-06-23T21:18:34 None \n", + "\n", + " source \\\n", + "0 {'code': 'ALLIANCE', 'title': 'Alliance News',... \n", + "\n", + " taxonomies type author \\\n", + "0 [{'termId': 'CTMRR', 'parentTermId': None, 'ti... news None \n", + "\n", + " meta sponsor parent \\\n", + "0 {'title': 'NEW YORK MARKET CLOSE: Stocks down,... None None \n", + "\n", + " contentId displayTaxonomies \\\n", + "0 al1687551514259519100 [{'termId': 'TPCOM', 'parentTermId': 'PTMKT', ... \n", + "\n", + " parentTaxonomy \n", + "0 None " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
createdupdatedtitlesummarymeta
02023-06-23T21:18:342023-06-23T21:18:34NEW YORK MARKET CLOSE: Stocks down, dollar up ...None{'title': 'NEW YORK MARKET CLOSE: Stocks down,...
12023-06-23T19:34:052023-06-23T19:34:05IN BRIEF: Blackstone Loan Financing proposes w...None{'title': 'IN BRIEF: Blackstone Loan Financing...
22023-06-23T18:34:412023-06-23T18:34:41IN BRIEF: Bonhill expects to complete sale of ...None{'title': 'IN BRIEF: Bonhill expects to comple...
32023-06-23T18:01:272023-06-23T18:01:27UPDATE: SRT Marine Systems raises GBP4.6 milli...None{'title': 'UPDATE: SRT Marine Systems raises G...
42023-06-23T18:00:272023-06-23T18:00:27IN BRIEF: New Energy One Acquisition confirms ...None{'title': 'IN BRIEF: New Energy One Acquisitio...
52023-06-23T17:41:152023-06-23T17:41:15IN BRIEF: Kropz makes draw down request on bri...None{'title': 'IN BRIEF: Kropz makes draw down req...
62023-06-23T17:31:172023-06-23T17:31:17IN BRIEF: XPS Pensions discusses National Pens...None{'title': 'IN BRIEF: XPS Pensions discusses Na...
72023-06-23T17:25:542023-06-23T17:25:54DIRECTOR DEALINGS: GSK CFO buys shares worth G...None{'title': 'DIRECTOR DEALINGS: GSK CFO buys sha...
82023-06-23T17:21:292023-06-23T17:21:29IN BRIEF: Gilead Sciences says test results sh...None{'title': 'IN BRIEF: Gilead Sciences says test...
92023-06-23T17:07:242023-06-23T17:07:24IN THE KNOW: AB Foods \"fundamentally strong\" w...None{'title': 'IN THE KNOW: AB Foods \"fundamentall...
\n", + "
" + ], + "text/plain": [ + " created updated \\\n", + "0 2023-06-23T21:18:34 2023-06-23T21:18:34 \n", + "1 2023-06-23T19:34:05 2023-06-23T19:34:05 \n", + "2 2023-06-23T18:34:41 2023-06-23T18:34:41 \n", + "3 2023-06-23T18:01:27 2023-06-23T18:01:27 \n", + "4 2023-06-23T18:00:27 2023-06-23T18:00:27 \n", + "5 2023-06-23T17:41:15 2023-06-23T17:41:15 \n", + "6 2023-06-23T17:31:17 2023-06-23T17:31:17 \n", + "7 2023-06-23T17:25:54 2023-06-23T17:25:54 \n", + "8 2023-06-23T17:21:29 2023-06-23T17:21:29 \n", + "9 2023-06-23T17:07:24 2023-06-23T17:07:24 \n", + "\n", + " title summary \\\n", + "0 NEW YORK MARKET CLOSE: Stocks down, dollar up ... None \n", + "1 IN BRIEF: Blackstone Loan Financing proposes w... None \n", + "2 IN BRIEF: Bonhill expects to complete sale of ... None \n", + "3 UPDATE: SRT Marine Systems raises GBP4.6 milli... None \n", + "4 IN BRIEF: New Energy One Acquisition confirms ... None \n", + "5 IN BRIEF: Kropz makes draw down request on bri... None \n", + "6 IN BRIEF: XPS Pensions discusses National Pens... None \n", + "7 DIRECTOR DEALINGS: GSK CFO buys shares worth G... None \n", + "8 IN BRIEF: Gilead Sciences says test results sh... None \n", + "9 IN THE KNOW: AB Foods \"fundamentally strong\" w... None \n", + "\n", + " meta \n", + "0 {'title': 'NEW YORK MARKET CLOSE: Stocks down,... \n", + "1 {'title': 'IN BRIEF: Blackstone Loan Financing... \n", + "2 {'title': 'IN BRIEF: Bonhill expects to comple... \n", + "3 {'title': 'UPDATE: SRT Marine Systems raises G... \n", + "4 {'title': 'IN BRIEF: New Energy One Acquisitio... \n", + "5 {'title': 'IN BRIEF: Kropz makes draw down req... \n", + "6 {'title': 'IN BRIEF: XPS Pensions discusses Na... \n", + "7 {'title': 'DIRECTOR DEALINGS: GSK CFO buys sha... \n", + "8 {'title': 'IN BRIEF: Gilead Sciences says test... \n", + "9 {'title': 'IN THE KNOW: AB Foods \"fundamentall... 
" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"created\", \"updated\", \"title\", \"summary\", \"meta\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Talk Market" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.talkmarkets_streaming import TalkMarkets_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading... 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = TalkMarkets_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60, 12)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cacheUrlclicktrackUrlcontentcontentNoFormattingtitletitleNoFormattingformattedUrlunescapedUrlurlvisibleUrlrichSnippetbreadcrumbUrl
0http://www.google.com/search?q=cache:PUjyIRJA8...https://www.google.com/url?client=internal-ele...23 hours ago <b>...</b> <b>Apple</b>, 187, 3.0...23 hours ago ... Apple, 187, 3.04, 1.65%, 187....Equitymaster India | Sensex Today Trades Lower...Equitymaster India | Sensex Today Trades Lower...https://talkmarkets.com/.../sensex-today-trade...https://talkmarkets.com/content/global-markets...https://talkmarkets.com/content/global-markets...talkmarkets.com{'cseImage': {'src': 'https://www.eqimg.com/im...{'host': 'talkmarkets.com', 'crumbs': ['sensex...
\n", + "
" + ], + "text/plain": [ + " cacheUrl \\\n", + "0 http://www.google.com/search?q=cache:PUjyIRJA8... \n", + "\n", + " clicktrackUrl \\\n", + "0 https://www.google.com/url?client=internal-ele... \n", + "\n", + " content \\\n", + "0 23 hours ago ... Apple, 187, 3.0... \n", + "\n", + " contentNoFormatting \\\n", + "0 23 hours ago ... Apple, 187, 3.04, 1.65%, 187.... \n", + "\n", + " title \\\n", + "0 Equitymaster India | Sensex Today Trades Lower... \n", + "\n", + " titleNoFormatting \\\n", + "0 Equitymaster India | Sensex Today Trades Lower... \n", + "\n", + " formattedUrl \\\n", + "0 https://talkmarkets.com/.../sensex-today-trade... \n", + "\n", + " unescapedUrl \\\n", + "0 https://talkmarkets.com/content/global-markets... \n", + "\n", + " url visibleUrl \\\n", + "0 https://talkmarkets.com/content/global-markets... talkmarkets.com \n", + "\n", + " richSnippet \\\n", + "0 {'cseImage': {'src': 'https://www.eqimg.com/im... \n", + "\n", + " breadcrumbUrl \n", + "0 {'host': 'talkmarkets.com', 'crumbs': ['sensex... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contenturlclicktrackUrl
023 hours ago <b>...</b> <b>Apple</b>, 187, 3.0...https://talkmarkets.com/content/global-markets...https://www.google.com/url?client=internal-ele...
11 day ago <b>...</b> Get Adobe Inc. (ADBE:NASD...https://talkmarkets.com/symbol/adbe/portal-wid...https://www.google.com/url?client=internal-ele...
21 day ago <b>...</b> Get Starbucks Corp (SBUX:...https://talkmarkets.com/symbol/sbux/portal-wid...https://www.google.com/url?client=internal-ele...
310 hours ago <b>...</b> Wednesday&#39;s top an...https://talkmarkets.com/symbol/pypl/portal-wid...https://www.google.com/url?client=internal-ele...
420 hours ago <b>...</b> <b>Apple</b> (AAPL). <...https://talkmarkets.com/content/stocks--equiti...https://www.google.com/url?client=internal-ele...
52 days ago <b>...</b> Friday&#39;s top analyst...https://talkmarkets.com/symbol/sofi/portal-wid...https://www.google.com/url?client=internal-ele...
62 days ago <b>...</b> Get Enphase Energy Inc (...https://talkmarkets.com/symbol/enph/portal-wid...https://www.google.com/url?client=internal-ele...
73 days ago <b>...</b> <b>Apple</b> Inc. design...https://talkmarkets.com/contributor/jimvanmeer...https://www.google.com/url?client=internal-ele...
83 days ago <b>...</b> <b>Apple</b> Inc. design...https://talkmarkets.com/content/stocks--equiti...https://www.google.com/url?client=internal-ele...
95 Jan 2023 <b>...</b> Get Amazon.com Inc (AMZN...https://talkmarkets.com/symbol/amzn/portal-wid...https://www.google.com/url?client=internal-ele...
\n", + "
" + ], + "text/plain": [ + " content \\\n", + "0 23 hours ago ... Apple, 187, 3.0... \n", + "1 1 day ago ... Get Adobe Inc. (ADBE:NASD... \n", + "2 1 day ago ... Get Starbucks Corp (SBUX:... \n", + "3 10 hours ago ... Wednesday's top an... \n", + "4 20 hours ago ... Apple (AAPL). <... \n", + "5 2 days ago ... Friday's top analyst... \n", + "6 2 days ago ... Get Enphase Energy Inc (... \n", + "7 3 days ago ... Apple Inc. design... \n", + "8 3 days ago ... Apple Inc. design... \n", + "9 5 Jan 2023 ... Get Amazon.com Inc (AMZN... \n", + "\n", + " url \\\n", + "0 https://talkmarkets.com/content/global-markets... \n", + "1 https://talkmarkets.com/symbol/adbe/portal-wid... \n", + "2 https://talkmarkets.com/symbol/sbux/portal-wid... \n", + "3 https://talkmarkets.com/symbol/pypl/portal-wid... \n", + "4 https://talkmarkets.com/content/stocks--equiti... \n", + "5 https://talkmarkets.com/symbol/sofi/portal-wid... \n", + "6 https://talkmarkets.com/symbol/enph/portal-wid... \n", + "7 https://talkmarkets.com/contributor/jimvanmeer... \n", + "8 https://talkmarkets.com/content/stocks--equiti... \n", + "9 https://talkmarkets.com/symbol/amzn/portal-wid... \n", + "\n", + " clicktrackUrl \n", + "0 https://www.google.com/url?client=internal-ele... \n", + "1 https://www.google.com/url?client=internal-ele... \n", + "2 https://www.google.com/url?client=internal-ele... \n", + "3 https://www.google.com/url?client=internal-ele... \n", + "4 https://www.google.com/url?client=internal-ele... \n", + "5 https://www.google.com/url?client=internal-ele... \n", + "6 https://www.google.com/url?client=internal-ele... \n", + "7 https://www.google.com/url?client=internal-ele... \n", + "8 https://www.google.com/url?client=internal-ele... \n", + "9 https://www.google.com/url?client=internal-ele... 
" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"content\", \"url\", \"clicktrackUrl\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The Fly" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.thefly_streaming import TheFly_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\ProgramData\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'thefly.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support the first page now!\n" + ] + } + ], + "source": [ + "news_downloader = TheFly_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"AAPL\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlestockabstractdatetime
0Apple in talks to launch Apple Card in India, ...AAPLApple is in talks to…06/23/2305:37
1Apple says visionOS software development kit n...AAPLApple announced the…06/21/2316:03
2Apple to create spatial experiences for Apple ...AAPLApple \"announced the…06/21/2316:00
3Notable open interest changes for June 21stTSLA NVDA AAPL AMZNTuesday's total…06/21/2308:55
4What You Missed This Week in Video GamesTCEHY TTWO EA CCOEY UBSFY CMCSK CMCSA RBLX AAP...\"Game On\" is The Fly's…06/20/2312:11
5Notable open interest changes for June 20thTSLA AMC AAPL NVDAFriday's total…06/20/2308:55
6Apple call buyer realizes 20% same-day gainsAAPLNotable profits for the…06/16/2308:00
7Notable open interest changes for June 15thTSLA NVDA SOFI AAPLWednesday's total…06/15/2308:55
8US senators propose bill to eliminate Section ...GOOG MSFT AMZN AAPL NVDA IBM META INTC\"U.S. Senators Josh…06/14/2317:54
9#SocialStocks: Twitter skips out on rent and G...TWTR AAPL META GOOGL GOOG ZM RBLX PINSWelcome to…06/14/2315:57
\n", + "
" + ], + "text/plain": [ + " title ... time\n", + "0 Apple in talks to launch Apple Card in India, ... ... 05:37\n", + "1 Apple says visionOS software development kit n... ... 16:03\n", + "2 Apple to create spatial experiences for Apple ... ... 16:00\n", + "3 Notable open interest changes for June 21st ... 08:55\n", + "4 What You Missed This Week in Video Games ... 12:11\n", + "5 Notable open interest changes for June 20th ... 08:55\n", + "6 Apple call buyer realizes 20% same-day gains ... 08:00\n", + "7 Notable open interest changes for June 15th ... 08:55\n", + "8 US senators propose bill to eliminate Section ... ... 17:54\n", + "9 #SocialStocks: Twitter skips out on rent and G... ... 15:57\n", + "\n", + "[10 rows x 5 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"stock\", \"abstract\", \"date\", \"time\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tip Rank" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.tipranks_streaming import TipRanks_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading: 0 1 2 " + ] + } + ], + "source": [ + "news_downloader = TipRanks_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stocks_idauthorcategorydatedescriptionimageisLockedlinklockTypeslugstickythumbnailtitletopicstimeAgobadgeid
0[{'ticker': 'AAPL', 'market': None}]582530{'slug': 'steveanderson'}{'slug': 'news', 'title': 'Market News'}2023-06-22T19:52:41.000Z<p>Those who regularly follow Apple stock (NAS...{'src': 'https://blog.tipranks.com/wp-content/...Truehttps://www.tipranks.com/news/aapl-notches-up-...GraceCountaapl-notches-up-following-barclays-commentsFalse{'src': 'https://blog.tipranks.com/wp-content/...AAPL Notches Up Following Barclays Comments[{'id': 0, 'type': 'stock', 'title': 'AAPL', '...13hNone582530
1[{'ticker': 'AAPL', 'market': None}, {'ticker'...579043{'slug': 'amit-singh'}{'slug': 'article', 'title': 'Stock Analysis &...2023-06-19T15:30:38.000Z<p>Affirm Holdings (NASDAQ:AFRM) stock recover...{'src': 'https://blog.tipranks.com/wp-content/...Truehttps://www.tipranks.com/news/article/affirm-s...GraceCountaffirm-stock-has-risen-swiftly-will-it-beat-ap...False{'src': 'https://blog.tipranks.com/wp-content/...Affirm Stock Has Risen Swiftly. Will It Beat A...[{'id': 0, 'type': 'stock', 'title': 'AAPL', '...4dNone579043
\n", + "
" + ], + "text/plain": [ + " stocks _id \\\n", + "0 [{'ticker': 'AAPL', 'market': None}] 582530 \n", + "1 [{'ticker': 'AAPL', 'market': None}, {'ticker'... 579043 \n", + "\n", + " author \\\n", + "0 {'slug': 'steveanderson'} \n", + "1 {'slug': 'amit-singh'} \n", + "\n", + " category \\\n", + "0 {'slug': 'news', 'title': 'Market News'} \n", + "1 {'slug': 'article', 'title': 'Stock Analysis &... \n", + "\n", + " date \\\n", + "0 2023-06-22T19:52:41.000Z \n", + "1 2023-06-19T15:30:38.000Z \n", + "\n", + " description \\\n", + "0

Those who regularly follow Apple stock (NAS... \n", + "1

Affirm Holdings (NASDAQ:AFRM) stock recover... \n", + "\n", + " image isLocked \\\n", + "0 {'src': 'https://blog.tipranks.com/wp-content/... True \n", + "1 {'src': 'https://blog.tipranks.com/wp-content/... True \n", + "\n", + " link lockType \\\n", + "0 https://www.tipranks.com/news/aapl-notches-up-... GraceCount \n", + "1 https://www.tipranks.com/news/article/affirm-s... GraceCount \n", + "\n", + " slug sticky \\\n", + "0 aapl-notches-up-following-barclays-comments False \n", + "1 affirm-stock-has-risen-swiftly-will-it-beat-ap... False \n", + "\n", + " thumbnail \\\n", + "0 {'src': 'https://blog.tipranks.com/wp-content/... \n", + "1 {'src': 'https://blog.tipranks.com/wp-content/... \n", + "\n", + " title \\\n", + "0 AAPL Notches Up Following Barclays Comments \n", + "1 Affirm Stock Has Risen Swiftly. Will It Beat A... \n", + "\n", + " topics timeAgo badge id \n", + "0 [{'id': 0, 'type': 'stock', 'title': 'AAPL', '... 13h None 582530 \n", + "1 [{'id': 0, 'type': 'stock', 'title': 'AAPL', '... 4d None 579043 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stocksdateauthortitledescription
0[{'ticker': 'AAPL', 'market': None}]2023-06-22T19:52:41.000Z{'slug': 'steveanderson'}AAPL Notches Up Following Barclays Comments<p>Those who regularly follow Apple stock (NAS...
1[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-19T15:30:38.000Z{'slug': 'amit-singh'}Affirm Stock Has Risen Swiftly. Will It Beat A...<p>Affirm Holdings (NASDAQ:AFRM) stock recover...
2[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-16T20:51:11.000Z{'slug': 'joey-frenette'}Apple Stock (NASDAQ:AAPL): Expectations Too Mo...<p>Apple (NASDAQ:AAPL) stock recently hit a ne...
3[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-16T08:32:40.000Z{'slug': 'sheryl-sheth'}Lost the Nvidia and Apple Boom? Microsoft (NAS...<p>Think you lost the chance to become wealthy...
4[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-15T19:20:48.000Z{'slug': 'joey-frenette'}Unity Software (NASDAQ:U): Apple Vision Pro Pa...<p>Unity Software (NASDAQ:U) rallied 17% when ...
5[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-15T00:53:16.000Z{'slug': 'joey-frenette'}Investing in Apple’s (NASDAQ:AAPL) Ecosystem: ...<p>Apple (NASDAQ:AAPL) has been on an unbeliev...
6[{'ticker': 'AAPL', 'market': None}]2023-06-13T15:13:40.000Z{'slug': 'vince-condarcuri'}AAPL Stock Slips after Analyst Downgrade<p>Despite outperforming the S&amp;P 500 with ...
7[{'ticker': 'AAPL', 'market': None}]2023-06-10T15:09:23.000Z{'slug': 'martyshtrubel'}Apple Stock Gets a New Street-High Price Target<p>Apple’s (NASDAQ:AAPL) virtual reality and a...
8[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-09T09:46:28.000Z{'slug': 'amit-singh'}NVDA to META: Insiders Capitalise on Tech Stoc...<p>Technology stocks rebounded strongly in 202...
9[{'ticker': 'AAPL', 'market': None}, {'ticker'...2023-06-08T18:20:20.000Z{'slug': 'michaelbyrne'}Apple Stock is on Fire. Invest in it with Thes...<p>Apple (NASDAQ:AAPL) stock is off to a gain ...
\n", + "
" + ], + "text/plain": [ + " stocks \\\n", + "0 [{'ticker': 'AAPL', 'market': None}] \n", + "1 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "2 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "3 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "4 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "5 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "6 [{'ticker': 'AAPL', 'market': None}] \n", + "7 [{'ticker': 'AAPL', 'market': None}] \n", + "8 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "9 [{'ticker': 'AAPL', 'market': None}, {'ticker'... \n", + "\n", + " date author \\\n", + "0 2023-06-22T19:52:41.000Z {'slug': 'steveanderson'} \n", + "1 2023-06-19T15:30:38.000Z {'slug': 'amit-singh'} \n", + "2 2023-06-16T20:51:11.000Z {'slug': 'joey-frenette'} \n", + "3 2023-06-16T08:32:40.000Z {'slug': 'sheryl-sheth'} \n", + "4 2023-06-15T19:20:48.000Z {'slug': 'joey-frenette'} \n", + "5 2023-06-15T00:53:16.000Z {'slug': 'joey-frenette'} \n", + "6 2023-06-13T15:13:40.000Z {'slug': 'vince-condarcuri'} \n", + "7 2023-06-10T15:09:23.000Z {'slug': 'martyshtrubel'} \n", + "8 2023-06-09T09:46:28.000Z {'slug': 'amit-singh'} \n", + "9 2023-06-08T18:20:20.000Z {'slug': 'michaelbyrne'} \n", + "\n", + " title \\\n", + "0 AAPL Notches Up Following Barclays Comments \n", + "1 Affirm Stock Has Risen Swiftly. Will It Beat A... \n", + "2 Apple Stock (NASDAQ:AAPL): Expectations Too Mo... \n", + "3 Lost the Nvidia and Apple Boom? Microsoft (NAS... \n", + "4 Unity Software (NASDAQ:U): Apple Vision Pro Pa... \n", + "5 Investing in Apple’s (NASDAQ:AAPL) Ecosystem: ... \n", + "6 AAPL Stock Slips after Analyst Downgrade \n", + "7 Apple Stock Gets a New Street-High Price Target \n", + "8 NVDA to META: Insiders Capitalise on Tech Stoc... \n", + "9 Apple Stock is on Fire. Invest in it with Thes... \n", + "\n", + " description \n", + "0

Those who regularly follow Apple stock (NAS... \n", + "1

Affirm Holdings (NASDAQ:AFRM) stock recover... \n", + "2

Apple (NASDAQ:AAPL) stock recently hit a ne... \n", + "3

Think you lost the chance to become wealthy... \n", + "4

Unity Software (NASDAQ:U) rallied 17% when ... \n", + "5

Apple (NASDAQ:AAPL) has been on an unbeliev... \n", + "6

Despite outperforming the S&P 500 with ... \n", + "7

Apple’s (NASDAQ:AAPL) virtual reality and a... \n", + "8

Technology stocks rebounded strongly in 202... \n", + "9

Apple (NASDAQ:AAPL) stock is off to a gain ... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"stocks\", \"date\", \"author\", \"title\", \"description\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Market Watch (Date Range)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.marketwatch_date_range import MarketWatch_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2022-06-01\"\n", + "end_date = \"2022-06-30\"\n", + "keyword = \"apple\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support the first page now!\n" + ] + } + ], + "source": [ + "news_downloader = MarketWatch_Date_Range()\n", + "news_downloader.download_date_range_search(keyword = \"apple\", start_date = start_date, end_date = end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletimeauthor
0Gold falls more than 2% for the month, settles...Jun. 30, 2022 at 2:47 p.m. ETby Joseph Adinolfi
1AMD stock gets an upgrade as analyst says rece...Jun. 30, 2022 at 12:07 p.m. ETby Emily Bary
2All 30 Dow stocks are falling, with Goldman Sa...Jun. 30, 2022 at 9:47 a.m. ETby Tomi Kilgore
3Here’s how far oil could fall in a recession, ...Jun. 30, 2022 at 8:34 a.m. ETby Steve Goldstein
4Crypto Winter Is Coming After SEC Rejects Key ...Jun. 30, 2022 at 6:37 a.m. ET
5An FCC Commissioner Wants TikTok Yanked From A...Jun. 30, 2022 at 3:27 a.m. ETby Barron's
6Meta Has a New Problem. Profit Forecasts Now L...Jun. 29, 2022 at 1:18 p.m. ETby Barron's
7Fed rolls out new index to flag early warning ...Jun. 29, 2022 at 1:04 p.m. ETby Joy Wiltermuth
8Apple Investors Have Something New to Focus On...Jun. 29, 2022 at 12:41 p.m. ETby Barron's
9Here’s why this trader is piling back into one...Jun. 29, 2022 at 10:34 a.m. ETby Barbara Kollmeyer
\n", + "
" + ], + "text/plain": [ + " title \\\n", + "0 Gold falls more than 2% for the month, settles... \n", + "1 AMD stock gets an upgrade as analyst says rece... \n", + "2 All 30 Dow stocks are falling, with Goldman Sa... \n", + "3 Here’s how far oil could fall in a recession, ... \n", + "4 Crypto Winter Is Coming After SEC Rejects Key ... \n", + "5 An FCC Commissioner Wants TikTok Yanked From A... \n", + "6 Meta Has a New Problem. Profit Forecasts Now L... \n", + "7 Fed rolls out new index to flag early warning ... \n", + "8 Apple Investors Have Something New to Focus On... \n", + "9 Here’s why this trader is piling back into one... \n", + "\n", + " time author \n", + "0 Jun. 30, 2022 at 2:47 p.m. ET by Joseph Adinolfi \n", + "1 Jun. 30, 2022 at 12:07 p.m. ET by Emily Bary \n", + "2 Jun. 30, 2022 at 9:47 a.m. ET by Tomi Kilgore \n", + "3 Jun. 30, 2022 at 8:34 a.m. ET by Steve Goldstein \n", + "4 Jun. 30, 2022 at 6:37 a.m. ET \n", + "5 Jun. 30, 2022 at 3:27 a.m. ET by Barron's \n", + "6 Jun. 29, 2022 at 1:18 p.m. ET by Barron's \n", + "7 Jun. 29, 2022 at 1:04 p.m. ET by Joy Wiltermuth \n", + "8 Jun. 29, 2022 at 12:41 p.m. ET by Barron's \n", + "9 Jun. 29, 2022 at 10:34 a.m. 
ET by Barbara Kollmeyer " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"time\", \"author\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Market Watch (Streaming)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.marketwatch_streaming import MarketWatch_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support the first page now!\n" + ] + } + ], + "source": [ + "news_downloader = MarketWatch_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletimeauthor
0Tech IPOs Should Be Heating Up. Why They’re Not.Jun. 23, 2023 at 2:51 a.m. ETby Barron's
1Everything Is Going Right for Tesla. It’s Time...Jun. 23, 2023 at 1:30 a.m. ETby Barron's
2India’s Modi cracks jokes, chows down at swank...Jun. 22, 2023 at 11:38 p.m. ETby Associated Press
3Work-From-Home Job Openings Are ShrinkingJun. 22, 2023 at 6:23 p.m. ETby Barron's
4Congress Blasts E-Commerce Firm Temu Over Forc...Jun. 22, 2023 at 5:44 p.m. ETby Barron's
5Meta Platforms Inc. stock outperforms market o...Jun. 22, 2023 at 5:32 p.m. ETby MarketWatch Automation
6Microsoft Corp. stock outperforms competitors ...Jun. 22, 2023 at 5:32 p.m. ETby MarketWatch Automation
7Netflix Inc. stock underperforms Thursday when...Jun. 22, 2023 at 5:32 p.m. ETby MarketWatch Automation
8GameStop Corp. Cl A stock underperforms Thursd...Jun. 22, 2023 at 5:29 p.m. ETby MarketWatch Automation
9Amazon.com Inc. stock outperforms market on st...Jun. 22, 2023 at 5:25 p.m. ETby MarketWatch Automation
\n", + "
" + ], + "text/plain": [ + " title \\\n", + "0 Tech IPOs Should Be Heating Up. Why They’re Not. \n", + "1 Everything Is Going Right for Tesla. It’s Time... \n", + "2 India’s Modi cracks jokes, chows down at swank... \n", + "3 Work-From-Home Job Openings Are Shrinking \n", + "4 Congress Blasts E-Commerce Firm Temu Over Forc... \n", + "5 Meta Platforms Inc. stock outperforms market o... \n", + "6 Microsoft Corp. stock outperforms competitors ... \n", + "7 Netflix Inc. stock underperforms Thursday when... \n", + "8 GameStop Corp. Cl A stock underperforms Thursd... \n", + "9 Amazon.com Inc. stock outperforms market on st... \n", + "\n", + " time author \n", + "0 Jun. 23, 2023 at 2:51 a.m. ET by Barron's \n", + "1 Jun. 23, 2023 at 1:30 a.m. ET by Barron's \n", + "2 Jun. 22, 2023 at 11:38 p.m. ET by Associated Press \n", + "3 Jun. 22, 2023 at 6:23 p.m. ET by Barron's \n", + "4 Jun. 22, 2023 at 5:44 p.m. ET by Barron's \n", + "5 Jun. 22, 2023 at 5:32 p.m. ET by MarketWatch Automation \n", + "6 Jun. 22, 2023 at 5:32 p.m. ET by MarketWatch Automation \n", + "7 Jun. 22, 2023 at 5:32 p.m. ET by MarketWatch Automation \n", + "8 Jun. 22, 2023 at 5:29 p.m. ET by MarketWatch Automation \n", + "9 Jun. 22, 2023 at 5:25 p.m. ET by MarketWatch Automation " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"time\", \"author\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Penny Stock" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.pennystocks_streaming import PennyStocks_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requesting https://pennystocks.com ... 
succeed!\n", + "Gathering again .. Remaining Retry: 4\n", + "Only support the first page now!\n" + ] + } + ], + "source": [ + "news_downloader = PennyStocks_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titletimebriefreading_time
0Best Penny Stocks to Buy Ahead Of Apple’s Even...September 14, 2021\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat do penny stock in...5 minute read
0What Could The Apple Event Mean For Penny Stoc...October 13, 2020\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWill The Apple Event M...5 minute read
03 Red Hot Penny Stocks To Watch Before Next We...June 14, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch.\\n4 minute read
0Penny Stocks Definition & 7 Trading Strategies...June 8, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat Is A Penny Stock?...6 minute read
0Best Penny Stocks To Buy? 5 With Big News This...June 7, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch ...4 minute read
0Penny Stocks & The Stock Market Today: Top Tre...May 30, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat happened in the s...6 minute read
0Penny Stocks To Buy? 3 AI Stocks To Watch Righ...May 30, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tAI penny stocks to wat...5 minute read
0What Are Penny Stocks & Should You Buy Them In...May 19, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tShould You Find Penny ...6 minute read
0Trading Penny Stocks: 3 High-Growth Industries...May 8, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWatch these three indu...7 minute read
0Fed Meeting Live Updates: 10 Takeaways From Ma...May 3, 2023\\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tFOMC Statement From Ma...10 minute read
\n", + "
" + ], + "text/plain": [ + " title time \\\n", + "0 Best Penny Stocks to Buy Ahead Of Apple’s Even... September 14, 2021 \n", + "0 What Could The Apple Event Mean For Penny Stoc... October 13, 2020 \n", + "0 3 Red Hot Penny Stocks To Watch Before Next We... June 14, 2023 \n", + "0 Penny Stocks Definition & 7 Trading Strategies... June 8, 2023 \n", + "0 Best Penny Stocks To Buy? 5 With Big News This... June 7, 2023 \n", + "0 Penny Stocks & The Stock Market Today: Top Tre... May 30, 2023 \n", + "0 Penny Stocks To Buy? 3 AI Stocks To Watch Righ... May 30, 2023 \n", + "0 What Are Penny Stocks & Should You Buy Them In... May 19, 2023 \n", + "0 Trading Penny Stocks: 3 High-Growth Industries... May 8, 2023 \n", + "0 Fed Meeting Live Updates: 10 Takeaways From Ma... May 3, 2023 \n", + "\n", + " brief reading_time \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat do penny stock in... 5 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWill The Apple Event M... 5 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch.\\n 4 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat Is A Penny Stock?... 6 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tPenny stocks to watch ... 4 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWhat happened in the s... 6 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tAI penny stocks to wat... 5 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tShould You Find Penny ... 6 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tWatch these three indu... 7 minute read \n", + "0 \\r\\n\\r\\n\\t\\t\\t\\r\\n\\t\\t\\tFOMC Statement From Ma... 
10 minute read " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"time\", \"brief\", \"reading_time\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Seeking Alpha" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.seekingalpha_date_range import SeekingAlpha_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2023-06-01\"\n", + "end_date = \"2023-06-30\"\n", + "stock = \"AAPL\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading Titles: 100%|██████████| 1/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
publishOntitlecommentCount
02023-06-19T09:00:00-04:00Artificial intelligence is a '1995 moment' for...63
12023-06-16T11:59:46-04:00Citi: Don't worry about Big Tech fueling 2023'...17
22023-06-15T07:20:12-04:00Google said to temper chatbot use for employee...8
32023-06-14T05:17:31-04:00Nvidia crosses $1T market cap powered by the r...36
42023-06-13T17:17:00-04:00Intel in talks to be anchor investor in chip d...94
52023-06-13T12:14:45-04:00Hot Stocks: AAPL falls on downgrade; MANU rise...3
62023-06-13T06:22:59-04:00Apple notches record close as bulls continue t...18
72023-06-13T04:49:26-04:00Apple cut to Neutral at UBS on softer iPhone a...26
82023-06-09T05:42:17-04:00AI looking like a 'winner-take-more' game - Go...39
92023-06-09T05:21:04-04:00Zuckerberg's vision for AR/VR headsets differe...92
\n", + "" + ], + "text/plain": [ + " publishOn \\\n", + "0 2023-06-19T09:00:00-04:00 \n", + "1 2023-06-16T11:59:46-04:00 \n", + "2 2023-06-15T07:20:12-04:00 \n", + "3 2023-06-14T05:17:31-04:00 \n", + "4 2023-06-13T17:17:00-04:00 \n", + "5 2023-06-13T12:14:45-04:00 \n", + "6 2023-06-13T06:22:59-04:00 \n", + "7 2023-06-13T04:49:26-04:00 \n", + "8 2023-06-09T05:42:17-04:00 \n", + "9 2023-06-09T05:21:04-04:00 \n", + "\n", + " title commentCount \n", + "0 Artificial intelligence is a '1995 moment' for... 63 \n", + "1 Citi: Don't worry about Big Tech fueling 2023'... 17 \n", + "2 Google said to temper chatbot use for employee... 8 \n", + "3 Nvidia crosses $1T market cap powered by the r... 36 \n", + "4 Intel in talks to be anchor investor in chip d... 94 \n", + "5 Hot Stocks: AAPL falls on downgrade; MANU rise... 3 \n", + "6 Apple notches record close as bulls continue t... 18 \n", + "7 Apple cut to Neutral at UBS on softer iPhone a... 26 \n", + "8 AI looking like a 'winner-take-more' game - Go... 39 \n", + "9 Zuckerberg's vision for AR/VR headsets differe... 
92 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"publishOn\",\"title\",\"commentCount\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reuters" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.reuters_streaming import Reuters_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Geting pages: 1 2 3 " + ] + } + ], + "source": [ + "news_downloader = Reuters_Streaming()\n", + "news_downloader.download_streaming_search(keyword = \"apple\", rounds = 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
published_timetitledescription
02023-06-19T10:17:24.474ZHong Kong tycoon Jimmy Lai loses appeal agains...A Hong Kong appeal court on Monday blocked jai...
12023-06-19T09:49:09.391ZPodcast: Blinken meets Xi and Chinese bankers ...U.S. Secretary of State Anthony Blinken is in ...
22023-06-19T02:10:24.581ZTheir parents made China the world's factory. ...When Steven Du took over his parents' factory ...
32023-06-17T08:14:15.708ZJapan to open up Apple- and Google-dominated p...Japan plans to stoke competition in smartphone...
42023-06-16T13:28:02.538ZKhashoggi's widow sues Israeli spyware company...The widow of murdered Saudi journalist Jamal K...
52023-06-16T11:12:02.899ZFactbox: DLE companies racing to reshape globa...Lithium, the metal used to make electric vehic...
62023-06-16T10:17:23.831ZPodcast: US-Iran talks and Australia divided o...Australia is divided on a historic referendum ...
72023-06-16T02:16:44.523ZChinese e-commerce giants entice cautious cons...China's e-commerce platforms are competing fie...
82023-06-15T10:21:02.697ZS&P 500 leaps to highest close in 14 months; t...The S&P 500 and Nasdaq surged on Thursday to c...
92023-06-15T19:49:27.459ZMicrosoft notches record high valuation of nea...Microsoft Corp shares rose to a new record hig...
\n", + "
" + ], + "text/plain": [ + " published_time \\\n", + "0 2023-06-19T10:17:24.474Z \n", + "1 2023-06-19T09:49:09.391Z \n", + "2 2023-06-19T02:10:24.581Z \n", + "3 2023-06-17T08:14:15.708Z \n", + "4 2023-06-16T13:28:02.538Z \n", + "5 2023-06-16T11:12:02.899Z \n", + "6 2023-06-16T10:17:23.831Z \n", + "7 2023-06-16T02:16:44.523Z \n", + "8 2023-06-15T10:21:02.697Z \n", + "9 2023-06-15T19:49:27.459Z \n", + "\n", + " title \\\n", + "0 Hong Kong tycoon Jimmy Lai loses appeal agains... \n", + "1 Podcast: Blinken meets Xi and Chinese bankers ... \n", + "2 Their parents made China the world's factory. ... \n", + "3 Japan to open up Apple- and Google-dominated p... \n", + "4 Khashoggi's widow sues Israeli spyware company... \n", + "5 Factbox: DLE companies racing to reshape globa... \n", + "6 Podcast: US-Iran talks and Australia divided o... \n", + "7 Chinese e-commerce giants entice cautious cons... \n", + "8 S&P 500 leaps to highest close in 14 months; t... \n", + "9 Microsoft notches record high valuation of nea... \n", + "\n", + " description \n", + "0 A Hong Kong appeal court on Monday blocked jai... \n", + "1 U.S. Secretary of State Anthony Blinken is in ... \n", + "2 When Steven Du took over his parents' factory ... \n", + "3 Japan plans to stoke competition in smartphone... \n", + "4 The widow of murdered Saudi journalist Jamal K... \n", + "5 Lithium, the metal used to make electric vehic... \n", + "6 Australia is divided on a historic referendum ... \n", + "7 China's e-commerce platforms are competing fie... \n", + "8 The S&P 500 and Nasdaq surged on Thursday to c... \n", + "9 Microsoft Corp shares rose to a new record hig... 
" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"published_time\",\"title\",\"description\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sina Finance" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.sina_finance_date_range import Sina_Finance_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2016-01-01\"\n", + "end_date = \"2016-01-01\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 5/5 [00:04<00:00, 1.05it/s]\n", + "Checking ips: 100%|██████████| 75/75 [00:20<00:00, 3.67it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "获取到的代理ip数量: 75 。Get proxy ips: 75.\n", + "能用的代理数量: 75。Usable proxy ips: 75.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading Titles...: 100%|██████████| 1/1 [00:01<00:00, 1.54s/it]\n", + "Gathering news contents: 100%|██████████| 103/103 [00:22<00:00, 4.50it/s]\n" + ] + } + ], + "source": [ + "news_downloader = Sina_Finance_Date_Range(config)\n", + "news_downloader.download_date_range_all(start_date,end_date)\n", + "news_downloader.gather_content()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlecontent
0分析师:伊朗重回国际原油市场无法阻止新浪美股讯 北京时间1月1日晚CNBC称,加拿大皇家银行(RBC)分析师Helima Cro...
1FAA:波音767的逃生扶梯存在缺陷新浪美股讯 北京时间1日晚,美国联邦航空局(FAA)要求航空公司对波音767机型的救生扶梯进...
2非制造业新订单指数创新高 需求回升力度明显中新社北京1月1日电 (记者 刘长忠)记者1日从中国物流与采购联合会获悉,在最新发布的201...
3雷曼兄弟针对大和证券提起索赔诉讼新浪美股讯 北京时间1日下午共同社称,2008年破产的美国金融巨头雷曼兄弟公司的清算法人日前...
4国内钢铁PMI有所回升 钢市低迷形势有所改善新华社上海1月1日专电(记者李荣)据中物联钢铁物流专业委员会1日发布的指数报告,2015年1...
5马息岭凸显朝鲜旅游体育战略新浪美股北京时间1日讯 三位单板滑雪手将成为最早拜访马息岭滑雪场的西方专业运动员,他们本月就...
6五洲船舶破产清算 近十年来首现国有船厂倒闭(原标题:中国首家国有船厂破产倒闭)\\n低迷的中国造船市场,多年来首次出现国有船厂破产清算的...
7过半城市房价环比上涨 百城住宅均价加速升温资料图。中新社记者 武俊杰 摄\\n中新社北京1月1日电 (记者 庞无忌)中国房地产市场在20...
8经济学人:巴西病根到底在哪里新浪美股北京时间1日讯 原本,巴西人是该高高兴兴迎接2016年的。8月间,里约热内卢将举办南...
9中国首家国有船厂破产倒闭:五洲船舶目前已停工低迷的中国造船市场,多年来首次出现国有船厂破产清算的一幕。浙江海运集团旗下的五洲船舶修造公司...
\n", + "
" + ], + "text/plain": [ + " title content\n", + "0 分析师:伊朗重回国际原油市场无法阻止 新浪美股讯 北京时间1月1日晚CNBC称,加拿大皇家银行(RBC)分析师Helima Cro...\n", + "1 FAA:波音767的逃生扶梯存在缺陷 新浪美股讯 北京时间1日晚,美国联邦航空局(FAA)要求航空公司对波音767机型的救生扶梯进...\n", + "2 非制造业新订单指数创新高 需求回升力度明显 中新社北京1月1日电 (记者 刘长忠)记者1日从中国物流与采购联合会获悉,在最新发布的201...\n", + "3 雷曼兄弟针对大和证券提起索赔诉讼 新浪美股讯 北京时间1日下午共同社称,2008年破产的美国金融巨头雷曼兄弟公司的清算法人日前...\n", + "4 国内钢铁PMI有所回升 钢市低迷形势有所改善 新华社上海1月1日专电(记者李荣)据中物联钢铁物流专业委员会1日发布的指数报告,2015年1...\n", + "5 马息岭凸显朝鲜旅游体育战略 新浪美股北京时间1日讯 三位单板滑雪手将成为最早拜访马息岭滑雪场的西方专业运动员,他们本月就...\n", + "6 五洲船舶破产清算 近十年来首现国有船厂倒闭 (原标题:中国首家国有船厂破产倒闭)\\n低迷的中国造船市场,多年来首次出现国有船厂破产清算的...\n", + "7 过半城市房价环比上涨 百城住宅均价加速升温 资料图。中新社记者 武俊杰 摄\\n中新社北京1月1日电 (记者 庞无忌)中国房地产市场在20...\n", + "8 经济学人:巴西病根到底在哪里 新浪美股北京时间1日讯 原本,巴西人是该高高兴兴迎接2016年的。8月间,里约热内卢将举办南...\n", + "9 中国首家国有船厂破产倒闭:五洲船舶目前已停工 低迷的中国造船市场,多年来首次出现国有船厂破产清算的一幕。浙江海运集团旗下的五洲船舶修造公司..." + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"content\"]\n", + "news_downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Eastmoney" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.eastmoney_streaming import Eastmoney_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 3\n", + "stock = \"600519\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 5/5 [00:04<00:00, 1.08it/s]\n", + "Checking ips: 100%|██████████| 75/75 [00:20<00:00, 3.62it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"获取到的代理ip数量: 75 。Get proxy ips: 75.\n", + "能用的代理数量: 75。Usable proxy ips: 75.\n", + "Geting pages: 0 1 2 Get total 3 pages.\n" + ] + } + ], + "source": [ + "news_downloader = Eastmoney_Streaming(config)\n", + "news_downloader.download_streaming_stock(stock,pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
read amountcommentstitlecontent linkauthorcreate time
014076茅台2022年报的12个小秘密/news,600519,1295554981.html贵州茅台资讯04-09 19:40
12340东北证券维持贵州茅台买入评级 预计2023年净利润同比/news,600519,1295512910.html公司研报提示04-09 11:24
23850贵州茅台:融资余额169.34亿元,创近一年新低(04-07/news,600519,1295407809.html贵州茅台资讯04-08 07:30
32330贵州茅台:融资净买入1248.48万元,融资余额169.79亿/news,600519,1294929438.html贵州茅台资讯04-07 07:28
4280416贵州茅台公益基金会正式成立/news,600519,1294612056.html贵州茅台资讯04-06 12:29
53330贵州茅台04月04日获沪股通增持19.55万股/news,600519,1294268016.html贵州茅台资讯04-05 07:48
63120贵州茅台:融资余额169.66亿元,创近一年新低(04-04/news,600519,1294265710.html贵州茅台资讯04-05 07:30
722721164月4日北向资金最新动向(附十大成交股)/news,600519,1294192188.html贵州茅台资讯04-04 18:48
86541大宗交易:贵州茅台成交235.9万元,成交价1814.59元(/news,600519,1294173281.html贵州茅台资讯04-04 17:21
92330第一上海证券维持贵州茅台买入评级 目标价2428.8元/news,600519,1293784734.html公司研报提示04-04 09:30
\n", + "
" + ], + "text/plain": [ + " read amount comments title \\\n", + "0 1407 6 茅台2022年报的12个小秘密 \n", + "1 234 0 东北证券维持贵州茅台买入评级 预计2023年净利润同比 \n", + "2 385 0 贵州茅台:融资余额169.34亿元,创近一年新低(04-07 \n", + "3 233 0 贵州茅台:融资净买入1248.48万元,融资余额169.79亿 \n", + "4 2804 16 贵州茅台公益基金会正式成立 \n", + "5 333 0 贵州茅台04月04日获沪股通增持19.55万股 \n", + "6 312 0 贵州茅台:融资余额169.66亿元,创近一年新低(04-04 \n", + "7 22721 16 4月4日北向资金最新动向(附十大成交股) \n", + "8 654 1 大宗交易:贵州茅台成交235.9万元,成交价1814.59元( \n", + "9 233 0 第一上海证券维持贵州茅台买入评级 目标价2428.8元 \n", + "\n", + " content link author create time \n", + "0 /news,600519,1295554981.html 贵州茅台资讯 04-09 19:40 \n", + "1 /news,600519,1295512910.html 公司研报提示 04-09 11:24 \n", + "2 /news,600519,1295407809.html 贵州茅台资讯 04-08 07:30 \n", + "3 /news,600519,1294929438.html 贵州茅台资讯 04-07 07:28 \n", + "4 /news,600519,1294612056.html 贵州茅台资讯 04-06 12:29 \n", + "5 /news,600519,1294268016.html 贵州茅台资讯 04-05 07:48 \n", + "6 /news,600519,1294265710.html 贵州茅台资讯 04-05 07:30 \n", + "7 /news,600519,1294192188.html 贵州茅台资讯 04-04 18:48 \n", + "8 /news,600519,1294173281.html 贵州茅台资讯 04-04 17:21 \n", + "9 /news,600519,1293784734.html 公司研报提示 04-04 09:30 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"title\", \"create time\"]\n", + "news_downloader[selected_columns].dataframe.head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Finnhub / Yahoo" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.finnhub_date_range import Finnhub_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2023-01-01\"\n", + "end_date = \"2023-01-03\"\n", + "config = {\n", + " \"use_proxy\": \"us_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + " \"token\": \"YOUR_FINNHUB_TOKEN\" # Avaliable at 
https://finnhub.io/dashboard\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Checking ips: 100%|██████████| 75/75 [02:51<00:00, 2.28s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get proxy ips: 75.\n", + "Usable proxy ips: 75.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading Titles: 100%|██████████| 1/1 [00:02<00:00, 2.66s/it]\n", + "Gathering news contents: 48%|████▊ | 49/102 [03:18<02:51, 3.24s/it]c:\\Users\\Olive\\.conda\\envs\\finrl\\lib\\site-packages\\urllib3\\connectionpool.py:1052: InsecureRequestWarning: Unverified HTTPS request is being made to host 'thefly.com'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", + " InsecureRequestWarning,\n", + "Gathering news contents: 100%|██████████| 102/102 [06:15<00:00, 3.68s/it]\n" + ] + } + ], + "source": [ + "news_downloader = Finnhub_Date_Range(config)\n", + "news_downloader.download_date_range_stock(start_date,end_date)\n", + "news_downloader.gather_content()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorydatetimeheadlineidimagerelatedsourcesummaryurlcontent
0company2023-01-03 23:40:08My 26-Stock $349k Portfolio Gets A Nice Petrob...118107004https://media.gettyimages.com/id/1441204186/ph...AAPLSeekingAlphaMy portfolio, built specifically for my retire...https://finnhub.io/api/news?id=d3c15f6f365663b...Home\\nInvesting Strategy\\nPortfolio Strategy\\n...
1company2023-01-03 22:09:00Apple’s Market Cap Slides Below $2 Trillion fo...118105849AAPLYahooThe tech giant is one of only five U.S. compan...https://finnhub.io/api/news?id=42343678a7474e1...Error
\n", + "
" + ], + "text/plain": [ + " category datetime \\\n", + "0 company 2023-01-03 23:40:08 \n", + "1 company 2023-01-03 22:09:00 \n", + "\n", + " headline id \\\n", + "0 My 26-Stock $349k Portfolio Gets A Nice Petrob... 118107004 \n", + "1 Apple’s Market Cap Slides Below $2 Trillion fo... 118105849 \n", + "\n", + " image related source \\\n", + "0 https://media.gettyimages.com/id/1441204186/ph... AAPL SeekingAlpha \n", + "1 AAPL Yahoo \n", + "\n", + " summary \\\n", + "0 My portfolio, built specifically for my retire... \n", + "1 The tech giant is one of only five U.S. compan... \n", + "\n", + " url \\\n", + "0 https://finnhub.io/api/news?id=d3c15f6f365663b... \n", + "1 https://finnhub.io/api/news?id=42343678a7474e1... \n", + "\n", + " content \n", + "0 Home\\nInvesting Strategy\\nPortfolio Strategy\\n... \n", + "1 Error " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = news_downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
headlinecontent
0My 26-Stock $349k Portfolio Gets A Nice Petrob...Home\\nInvesting Strategy\\nPortfolio Strategy\\n...
1Apple’s Market Cap Slides Below $2 Trillion fo...Error
2US STOCKS-Wall St starts the year with a dip; ...(For a Reuters live blog on U.S., UK and Europ...
3Buy 4 January Dogs Of The Dow, Watch 4 MoreHome\\nDividends\\nDividend Quick Picks\\nBuy 4 J...
4Apple's stock market value falls below $2 tril...Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto...
5CORRECTED-UPDATE 1-Apple's stock market value ...Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto...
6Apple Stock Falls Amid Report Of Product Order...Apple stock got off to a slow start in 2023 as...
7US STOCKS-Wall St starts the year with a dip; ...Summary\\nCompanies\\nTesla shares plunge on Q4 ...
8More than $1 trillion wiped off value of Apple...apple store\\nMore than $1 trillion has been wi...
9McLean's Iridium inks agreement to put its sat...The company hasn't named its partner, but it's...
\n", + "
" + ], + "text/plain": [ + " headline \\\n", + "0 My 26-Stock $349k Portfolio Gets A Nice Petrob... \n", + "1 Apple’s Market Cap Slides Below $2 Trillion fo... \n", + "2 US STOCKS-Wall St starts the year with a dip; ... \n", + "3 Buy 4 January Dogs Of The Dow, Watch 4 More \n", + "4 Apple's stock market value falls below $2 tril... \n", + "5 CORRECTED-UPDATE 1-Apple's stock market value ... \n", + "6 Apple Stock Falls Amid Report Of Product Order... \n", + "7 US STOCKS-Wall St starts the year with a dip; ... \n", + "8 More than $1 trillion wiped off value of Apple... \n", + "9 McLean's Iridium inks agreement to put its sat... \n", + "\n", + " content \n", + "0 Home\\nInvesting Strategy\\nPortfolio Strategy\\n... \n", + "1 Error \n", + "2 (For a Reuters live blog on U.S., UK and Europ... \n", + "3 Home\\nDividends\\nDividend Quick Picks\\nBuy 4 J... \n", + "4 Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto... \n", + "5 Jan 3 (Reuters) - Apple Inc's \\n(AAPL.O)\\n sto... \n", + "6 Apple stock got off to a slow start in 2023 as... \n", + "7 Summary\\nCompanies\\nTesla shares plunge on Q4 ... \n", + "8 apple store\\nMore than $1 trillion has been wi... \n", + "9 The company hasn't named its partner, but it's... 
" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"headline\", \"content\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finrl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/test/Data_Sources_Social_Media.ipynb b/FinNLP/test/Data_Sources_Social_Media.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a346dd8ead1c9a11b44912cac227b1dabf41bd87 --- /dev/null +++ b/FinNLP/test/Data_Sources_Social_Media.ipynb @@ -0,0 +1,2261 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"../../FinNLP\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Eastmoney" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.eastmoney_streaming import Eastmoney_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 3\n", + "stock = \"600519\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 
0 1 2 " + ] + } + ], + "source": [ + "downloader = Eastmoney_Streaming()\n", + "downloader.download_streaming_stock(stock, pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(241, 92)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
post_idpost_titlestockbar_codestockbar_namestockbar_typeuser_iduser_nicknameuser_extendinfospost_click_countpost_forward_count...relate_topiczwpage_flagsource_post_comment_countpost_atuserreply_listcontent_typerepost_statereptile_stateallow_likes_statepost_is_hot
01324058647贵州茅台:每股派25.911元 6月30日共计派发现金红利325.49亿元600519贵州茅台吧100.07344113638256342贵州茅台资讯{'user_accreditinfos': None, 'deactive': '0', ...379914...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

1 rows × 92 columns

\n", + "
" + ], + "text/plain": [ + " post_id post_title stockbar_code \\\n", + "0 1324058647 贵州茅台:每股派25.911元 6月30日共计派发现金红利325.49亿元 600519 \n", + "\n", + " stockbar_name stockbar_type user_id user_nickname \\\n", + "0 贵州茅台吧 100.0 7344113638256342 贵州茅台资讯 \n", + "\n", + " user_extendinfos post_click_count \\\n", + "0 {'user_accreditinfos': None, 'deactive': '0', ... 3799 \n", + "\n", + " post_forward_count ... relate_topic zwpage_flag \\\n", + "0 14 ... NaN NaN \n", + "\n", + " source_post_comment_count post_atuser reply_list content_type \\\n", + "0 NaN NaN NaN NaN \n", + "\n", + " repost_state reptile_state allow_likes_state post_is_hot \n", + "0 NaN NaN NaN NaN \n", + "\n", + "[1 rows x 92 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.dataframe.head(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
post_titleuser_nicknamestockbar_namepost_click_countpost_forward_countpost_comment_countpost_publish_timepost_last_timepost_display_time
0贵州茅台:每股派25.911元 6月30日共计派发现金红利325.49亿元贵州茅台资讯贵州茅台吧379914152023-06-25 22:17:502023-06-26 03:12:472023-06-25 22:17:50
1贵州茅台:贵州茅台2022年年度权益分派实施公告贵州茅台资讯贵州茅台吧642347172023-06-25 15:32:422023-06-26 00:57:392023-06-26 00:00:00
2将派发现金红利325.49亿元!贵州茅台上市以来累计分红超2000亿元贵州茅台资讯贵州茅台吧460102023-06-25 23:49:072023-06-25 23:49:072023-06-25 23:49:07
3茅台冰淇淋悄然卖数亿 年轻市场真被抓住了吗贵州茅台资讯贵州茅台吧261215112023-06-24 07:03:532023-06-25 18:48:212023-06-24 07:03:53
4白酒本周跌5.49%原因是什么?下周怎么看?NaNNaN101974252023-06-24 12:29:532023-06-25 23:12:492023-06-24 12:29:53
5本周持仓与下周交易计划满仓日记财富号评论吧547212023-06-25 20:30:542023-06-26 03:19:082023-06-25 20:30:54
6茅台酒的估值真的是高菩萨小跟班888贵州茅台吧33002023-06-26 03:02:142023-06-26 03:02:142023-06-26 03:02:14
7茅台里面的资金估计要出来支持一些中小微企业政策导向[吃瓜]菩萨小跟班888贵州茅台吧24002023-06-26 01:50:122023-06-26 01:50:122023-06-26 01:50:12
8每股市值收益率,还没有银行定期利息高呢。(远离泡沫浮云地震带)章鱼帝的智慧贵州茅台吧33012023-06-25 22:48:492023-06-26 01:20:042023-06-25 22:48:49
96月最后的倔强(浪潮信息,昆仑万维,鸿博股份)赛道复苏。夏夏爱美丽财富号评论吧24590342023-06-25 22:16:032023-06-26 00:45:532023-06-25 22:16:03
\n", + "
" + ], + "text/plain": [ + " post_title user_nickname stockbar_name \\\n", + "0 贵州茅台:每股派25.911元 6月30日共计派发现金红利325.49亿元 贵州茅台资讯 贵州茅台吧 \n", + "1 贵州茅台:贵州茅台2022年年度权益分派实施公告 贵州茅台资讯 贵州茅台吧 \n", + "2 将派发现金红利325.49亿元!贵州茅台上市以来累计分红超2000亿元 贵州茅台资讯 贵州茅台吧 \n", + "3 茅台冰淇淋悄然卖数亿 年轻市场真被抓住了吗 贵州茅台资讯 贵州茅台吧 \n", + "4 白酒本周跌5.49%原因是什么?下周怎么看? NaN NaN \n", + "5 本周持仓与下周交易计划 满仓日记 财富号评论吧 \n", + "6 茅台酒的估值真的是高 菩萨小跟班888 贵州茅台吧 \n", + "7 茅台里面的资金估计要出来支持一些中小微企业政策导向[吃瓜] 菩萨小跟班888 贵州茅台吧 \n", + "8 每股市值收益率,还没有银行定期利息高呢。(远离泡沫浮云地震带) 章鱼帝的智慧 贵州茅台吧 \n", + "9 6月最后的倔强(浪潮信息,昆仑万维,鸿博股份)赛道复苏。 夏夏爱美丽 财富号评论吧 \n", + "\n", + " post_click_count post_forward_count post_comment_count \\\n", + "0 3799 14 15 \n", + "1 6423 47 17 \n", + "2 460 1 0 \n", + "3 2612 15 11 \n", + "4 10197 4 25 \n", + "5 547 2 1 \n", + "6 33 0 0 \n", + "7 24 0 0 \n", + "8 33 0 1 \n", + "9 2459 0 34 \n", + "\n", + " post_publish_time post_last_time post_display_time \n", + "0 2023-06-25 22:17:50 2023-06-26 03:12:47 2023-06-25 22:17:50 \n", + "1 2023-06-25 15:32:42 2023-06-26 00:57:39 2023-06-26 00:00:00 \n", + "2 2023-06-25 23:49:07 2023-06-25 23:49:07 2023-06-25 23:49:07 \n", + "3 2023-06-24 07:03:53 2023-06-25 18:48:21 2023-06-24 07:03:53 \n", + "4 2023-06-24 12:29:53 2023-06-25 23:12:49 2023-06-24 12:29:53 \n", + "5 2023-06-25 20:30:54 2023-06-26 03:19:08 2023-06-25 20:30:54 \n", + "6 2023-06-26 03:02:14 2023-06-26 03:02:14 2023-06-26 03:02:14 \n", + "7 2023-06-26 01:50:12 2023-06-26 01:50:12 2023-06-26 01:50:12 \n", + "8 2023-06-25 22:48:49 2023-06-26 01:20:04 2023-06-25 22:48:49 \n", + "9 2023-06-25 22:16:03 2023-06-26 00:45:53 2023-06-25 22:16:03 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"post_title\",\"user_nickname\", \"stockbar_name\" ,\"post_click_count\", \"post_forward_count\", \"post_comment_count\", \"post_publish_time\", \"post_last_time\", \"post_display_time\"]\n", + "downloader.dataframe[selected_columns].head(10)" + ] + }, + { + "attachments": 
{}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Facebook get cookies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from selenium import webdriver\n", + "import json\n", + "\n", + "browser = webdriver.ChromiumEdge()\n", + "browser.get('https://www.facebook.com')\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Please log in to your account in the browser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cookies = browser.get_cookies() \n", + "with open(\"cookies.json\", \"w\", encoding=\"utf-8\") as cks:\n", + " json.dump(cookies, cks)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Facebook" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.facebook_streaming import Facebook_Streaming\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# load cookies\n", + "with open(\"cookies.json\", \"r\", encoding=\"utf-8\") as cks: \n", + " cookies = json.load(cks)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "config = {\n", + " \"cookies\":cookies, \n", + " \"headless\": False,\n", + " \"stealth_path\":\"../../FinNLP/finnlp/data_sources/social_media/stealth.min.js\"\n", + " }\n", + "pages = 3\n", + "stock = \"AAPL\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 17/17 [00:57<00:00, 3.37s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Only support the first page now!\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "downloader = Facebook_Streaming(config)\n", + "downloader.download_streaming_stock(stock, pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
contentdate
6AAPL (Stock Market)4h󰞋󰙷
8Day 7\\nIntroduction to Stock Market\\nWhat you ...6h󰞋󰙷
11US: AAPL new high and breakout from two-year r...1d󰞋󰙷
\n", + "
" + ], + "text/plain": [ + " content date\n", + "6 AAPL (Stock Market) 4h󰞋󰙷\n", + "8 Day 7\\nIntroduction to Stock Market\\nWhat you ... 6h󰞋󰙷\n", + "11 US: AAPL new high and breakout from two-year r... 1d󰞋󰙷" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.dataframe" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Xueqiu / 雪球" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.xueqiu_streaming import Xueqiu_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 3\n", + "stock = \"茅台\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading ... 0 1 2 " + ] + } + ], + "source": [ + "downloader = Xueqiu_Streaming()\n", + "downloader.download_streaming_stock(stock, pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(29, 53)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
blockedblockingcanEditcommentIdcontroversialcreated_atdescriptiondonate_countdonate_snowcoineditable...truncated_bytypeuseruser_idview_countfirstImgpic_sizesedited_atquote_cardssymbol_id
0FalseFalseTrue0False2023-06-25 12:15:07<a href=\"http://xueqiu.com/S/SZ000860\" target=...00True...02{'allow_all_stock': False, 'block_status': 0, ...8364804052471NaNNaNNaNNaNNaN
\n", + "

1 rows × 53 columns

\n", + "
" + ], + "text/plain": [ + " blocked blocking canEdit commentId controversial created_at \\\n", + "0 False False True 0 False 2023-06-25 12:15:07 \n", + "\n", + " description donate_count \\\n", + "0 \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
created_atdescriptiontitletexttargetsourceuser
02023-06-25 12:15:07<a href=\"http://xueqiu.com/S/SZ000860\" target=...<a href=\"http://xueqiu.com/S/SZ000860\" target=.../8364804052/253976413Android{'allow_all_stock': False, 'block_status': 0, ...
12023-06-25 12:14:22<a href=\"http://xueqiu.com/S/SH600519\" target=...<p><a href=\"http://xueqiu.com/S/SH600519\" targ.../4631817224/253976390雪球{'allow_all_stock': False, 'block_status': 0, ...
22023-06-25 12:13:01...提高。白酒:五粮液、迎驾贡酒、<span class='highlight'>茅台</...6.25 赛道和白马的机会<p>这个假期外围的环境不太好,已经是基本共识了。明天开盘大A承压低开也基本是一致预期。这么.../4322952939/253976335雪球{'allow_all_stock': False, 'block_status': 0, ...
32023-06-25 11:58:55茅台发生活费了茅台发生活费了<br/><img class=\"ke_img\" src=\"https://x.../4653939718/253975764iPhone{'allow_all_stock': False, 'block_status': 0, ...
42023-06-25 11:54:05...业绩及股价,形成正反馈。当年<span class='highlight'>茅台</s...持仓吹票,共同致富<p><a href=\"http://xueqiu.com/k?q=%23%E4%BB%A5.../8113901491/253975613Android{'allow_all_stock': False, 'block_status': 0, ...
52023-06-25 11:50:11微酒酒业快讯,6月25日,酒业新闻一览-·企业动态·-01<span class='high...6.25:<span class='highlight'>茅</span><span cla...<p><img class=\"ke_img\" src=\"https://xqimg.imed.../3615583399/253975485雪球{'allow_all_stock': False, 'block_status': 0, ...
62023-06-25 11:48:42<a href=\"http://xueqiu.com/S/SH603027\" target=...<a href=\"http://xueqiu.com/S/SH603027\" target=.../2659542807/253975430iPhone{'allow_all_stock': False, 'block_status': 0, ...
72023-06-25 11:45:54段永平说:我不鼓励小散投<a href=\"https://xueqiu.com/S/AAPL...段永平说:我不鼓励小散投<a href=\"https://xueqiu.com/S/AAPL.../9456980430/253975338iPhone{'allow_all_stock': False, 'block_status': 0, ...
82023-06-25 11:33:01泸州老窖酒传统酿制技艺第二十三代传承人·国窖1573·曾娜大师鉴藏版,端午举杯小酒。<br/...泸州老窖酒传统酿制技艺第二十三代传承人·国窖1573·曾娜大师鉴藏版,端午举杯小酒。<br/.../9893982765/253974916Android{'allow_all_stock': False, 'block_status': 0, ...
92023-06-25 11:25:44...酒店中,白酒卖得最好的往往不是<span class='highlight'>茅台</...街头没生意的烟酒店,为什么不会倒闭<p><img class=\"ke_img\" src=\"https://xqimg.imed.../5497522856/253974630雪球{'allow_all_stock': False, 'block_status': 0, ...
\n", + "" + ], + "text/plain": [ + " created_at description \\\n", + "0 2023-06-25 12:15:07
茅台茅台茅台这个假期外围的环境不太好,已经是基本共识了。明天开盘大A承压低开也基本是一致预期。这么... /4322952939/253976335 \n", + "3 茅台发生活费了
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idbodycreated_atusersourcesymbolspricesmentioned_usersentitiesliked_by_selfreshared_by_selflinksreshare_messageconversationlikesresharesnetwork
0522005335NANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL...2023-04-07T15:24:22Z{'id': 4744627, 'username': 'JavierAyala', 'na...{'id': 1149, 'title': 'StockTwits for iOS', 'u...[{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '...[{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '...[]{'sentiment': None}FalseFalseNaNNaNNaNNaNNaNNaN
1522004768$AAPL $SPY \\n \\nhttps://amp.scmp.com/news/chi...2023-04-07T15:17:43Z{'id': 6330207, 'username': 'PlainFacts_2121',...{'id': 2269, 'title': 'StockTwits Web', 'url':...[{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '...[{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '...[]{'sentiment': None}FalseFalse[{'title': 'China officials who abused health ...NaNNaNNaNNaNNaN
\n", + "" + ], + "text/plain": [ + " id body \\\n", + "0 522005335 NANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL... \n", + "1 522004768 $AAPL $SPY \\n \\nhttps://amp.scmp.com/news/chi... \n", + "\n", + " created_at user \\\n", + "0 2023-04-07T15:24:22Z {'id': 4744627, 'username': 'JavierAyala', 'na... \n", + "1 2023-04-07T15:17:43Z {'id': 6330207, 'username': 'PlainFacts_2121',... \n", + "\n", + " source \\\n", + "0 {'id': 1149, 'title': 'StockTwits for iOS', 'u... \n", + "1 {'id': 2269, 'title': 'StockTwits Web', 'url':... \n", + "\n", + " symbols \\\n", + "0 [{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '... \n", + "1 [{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '... \n", + "\n", + " prices mentioned_users \\\n", + "0 [{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '... [] \n", + "1 [{'id': 686, 'symbol': 'AAPL', 'symbol_mic': '... [] \n", + "\n", + " entities liked_by_self reshared_by_self \\\n", + "0 {'sentiment': None} False False \n", + "1 {'sentiment': None} False False \n", + "\n", + " links reshare_message \\\n", + "0 NaN NaN \n", + "1 [{'title': 'China officials who abused health ... NaN \n", + "\n", + " conversation likes reshares network \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
created_atbody
02023-04-07T15:24:22ZNANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL...
12023-04-07T15:17:43Z$AAPL $SPY \\n \\nhttps://amp.scmp.com/news/chi...
22023-04-07T15:17:25Z$AAPL $GOOG $AMZN I took a Trump today. \\n\\nH...
32023-04-07T15:16:54Z$SPY $AAPL will take this baby down, time for ...
42023-04-07T15:11:37Z$SPY $3T it ALREADY DID - look at the pre-COV...
52023-04-07T15:10:29Z$AAPL $QQQ $STUDY We are on to the next one! A...
62023-04-07T15:06:00Z$AAPL was analyzed by 48 analysts. The buy con...
72023-04-07T14:54:29Z$AAPL both retiring. \\n \\nCraig....
82023-04-07T14:40:06Z$SPY $QQQ $TSLA $AAPL SPY 500 HAS STARTED🚀😍 BI...
92023-04-07T14:38:57ZNancy 🩵 (Tim) $AAPL
\n", + "
" + ], + "text/plain": [ + " created_at body\n", + "0 2023-04-07T15:24:22Z NANCY PELOSI JUST BOUGHT 10,000 SHARES OF APPL...\n", + "1 2023-04-07T15:17:43Z $AAPL $SPY \\n \\nhttps://amp.scmp.com/news/chi...\n", + "2 2023-04-07T15:17:25Z $AAPL $GOOG $AMZN I took a Trump today. \\n\\nH...\n", + "3 2023-04-07T15:16:54Z $SPY $AAPL will take this baby down, time for ...\n", + "4 2023-04-07T15:11:37Z $SPY $3T it ALREADY DID - look at the pre-COV...\n", + "5 2023-04-07T15:10:29Z $AAPL $QQQ $STUDY We are on to the next one! A...\n", + "6 2023-04-07T15:06:00Z $AAPL was analyzed by 48 analysts. The buy con...\n", + "7 2023-04-07T14:54:29Z $AAPL both retiring. \\n \\nCraig....\n", + "8 2023-04-07T14:40:06Z $SPY $QQQ $TSLA $AAPL SPY 500 HAS STARTED🚀😍 BI...\n", + "9 2023-04-07T14:38:57Z Nancy 🩵 (Tim) $AAPL" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"created_at\", \"body\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reddit Wallstreetbets Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.reddit_streaming import Reddit_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 3\n", + "config = {\n", + " # \"use_proxy\": \"us_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 2,\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading by pages...: 100%|██████████| 3/3 [00:08<00:00, 2.83s/it]\n" + ] + } + ], + "source": [ + "downloader = Reddit_Streaming(config)\n", + "downloader.download_streaming_all(pages)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnumCommentscreatedscoredistinguishTypeisLockedisStickiedthumbnailtitleauthor...postEventInfopredictionTournamentreactedFromremovedByremovedByCategorysubredditsuggestedCommentSorttopAwardedTypeurlwhitelistStatus
0t3_12epaq0816808819740000NoneFalseFalse{'url': 'https://b.thumbs.redditmedia.com/W8hd...Y’all making me feel like spoodermanghostwholags...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1t3_zr9v10016715957820002NoneTrueFalse{'url': 'https://b.thumbs.redditmedia.com/dJqb...Do you track your investments in a spreadsheet...sharesight...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

2 rows × 100 columns

\n", + "
" + ], + "text/plain": [ + " id numComments created score distinguishType isLocked \\\n", + "0 t3_12epaq0 8 1680881974000 0 None False \n", + "1 t3_zr9v10 0 1671595782000 2 None True \n", + "\n", + " isStickied thumbnail \\\n", + "0 False {'url': 'https://b.thumbs.redditmedia.com/W8hd... \n", + "1 False {'url': 'https://b.thumbs.redditmedia.com/dJqb... \n", + "\n", + " title author ... \\\n", + "0 Y’all making me feel like spooderman ghostwholags ... \n", + "1 Do you track your investments in a spreadsheet... sharesight ... \n", + "\n", + " postEventInfo predictionTournament reactedFrom removedBy removedByCategory \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "\n", + " subreddit suggestedCommentSort topAwardedType url whitelistStatus \n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "\n", + "[2 rows x 100 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnumCommentscreatedscoredistinguishTypeisLockedisStickiedthumbnailtitleauthor...postEventInfopredictionTournamentreactedFromremovedByremovedByCategorysubredditsuggestedCommentSorttopAwardedTypeurlwhitelistStatus
0t3_12epaq082023-04-07 15:39:340NoneFalseFalse{'url': 'https://b.thumbs.redditmedia.com/W8hd...Y’all making me feel like spoodermanghostwholags...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1t3_zr9v1002022-12-21 04:09:422NoneTrueFalse{'url': 'https://b.thumbs.redditmedia.com/dJqb...Do you track your investments in a spreadsheet...sharesight...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

2 rows × 100 columns

\n", + "
" + ], + "text/plain": [ + " id numComments created score distinguishType isLocked \\\n", + "0 t3_12epaq0 8 2023-04-07 15:39:34 0 None False \n", + "1 t3_zr9v10 0 2022-12-21 04:09:42 2 None True \n", + "\n", + " isStickied thumbnail \\\n", + "0 False {'url': 'https://b.thumbs.redditmedia.com/W8hd... \n", + "1 False {'url': 'https://b.thumbs.redditmedia.com/dJqb... \n", + "\n", + " title author ... \\\n", + "0 Y’all making me feel like spooderman ghostwholags ... \n", + "1 Do you track your investments in a spreadsheet... sharesight ... \n", + "\n", + " postEventInfo predictionTournament reactedFrom removedBy removedByCategory \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "\n", + " subreddit suggestedCommentSort topAwardedType url whitelistStatus \n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN \n", + "\n", + "[2 rows x 100 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df[\"created\"] = pd.to_datetime(df[\"created\"], unit = \"ms\")\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
createdtitle
02023-04-07 15:39:34Y’all making me feel like spooderman
12022-12-21 04:09:42Do you track your investments in a spreadsheet...
22022-12-21 04:09:42Do you track your investments in a spreadsheet...
32023-04-07 15:29:23Can a Blackberry holder get some help 🥺
42023-04-07 14:49:55The week of CPI and FOMC Minutes… 4-6-23 SPY/ ...
52023-04-07 14:19:22Well let’s hope your job likes you, thanks Jerome
62023-04-07 14:06:32Does anyone else feel an overwhelming sense of...
72023-04-07 13:47:59Watermarked Jesus explains the market being cl...
82023-04-07 13:26:23Jobs report shows 236,000 gain in March. Hot l...
92023-04-07 13:07:15The recession is over! Let's buy more stocks!
\n", + "
" + ], + "text/plain": [ + " created title\n", + "0 2023-04-07 15:39:34 Y’all making me feel like spooderman\n", + "1 2022-12-21 04:09:42 Do you track your investments in a spreadsheet...\n", + "2 2022-12-21 04:09:42 Do you track your investments in a spreadsheet...\n", + "3 2023-04-07 15:29:23 Can a Blackberry holder get some help 🥺\n", + "4 2023-04-07 14:49:55 The week of CPI and FOMC Minutes… 4-6-23 SPY/ ...\n", + "5 2023-04-07 14:19:22 Well let’s hope your job likes you, thanks Jerome\n", + "6 2023-04-07 14:06:32 Does anyone else feel an overwhelming sense of...\n", + "7 2023-04-07 13:47:59 Watermarked Jesus explains the market being cl...\n", + "8 2023-04-07 13:26:23 Jobs report shows 236,000 gain in March. Hot l...\n", + "9 2023-04-07 13:07:15 The recession is over! Let's buy more stocks!" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"created\", \"title\"]\n", + "df[selected_columns].head(10)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Weibo Date Range" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.weibo_date_range import Weibo_Date_Range" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2016-01-01\"\n", + "end_date = \"2016-01-02\"\n", + "stock = \"茅台\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + " \"cookies\": \"Your_Login_Cookies\",\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 5/5 [00:09<00:00, 1.95s/it]\n", + "Checking ips: 100%|██████████| 75/75 [01:23<00:00, 1.11s/it]\n" + ] + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "获取到的代理ip数量: 75 。Get proxy ips: 75.\n", + "能用的代理数量: 13。Usable proxy ips: 13.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading by dates...: 100%|██████████| 2/2 [01:03<00:00, 31.56s/it]\n" + ] + } + ], + "source": [ + "downloader = Weibo_Date_Range(config)\n", + "downloader.download_date_range_stock(start_date, end_date, stock = stock)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datedate_contentsourcecontent
02016-01-012016年01月01日23:41Moto X#舆论之锤#唯品会发声明证实销售假茅台-手机腾讯网O网页链接分享来自浏览器!
22016-01-012016年01月01日22:57新浪博客2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原...
62016-01-012016年01月01日22:56新浪博客2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原...
172016-01-012016年01月01日22:40五蕴皆崆Android开心,今天喝了两斤酒(茅台+扎二)三个人,开心!
182016-01-01NaNNaN一家专卖假货的网站某宝,你该学学了!//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品...
192016-01-01NaNNaN一家专卖假货的网站//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品会售假茅台:供货商...
202016-01-012016年01月01日21:46360安全浏览器前几天说了几点不看好茅台的理由,今年过节喝点茅台支持下,个人口感,茅台比小五好喝,茅台依然是...
212016-01-012016年01月01日21:44华为P8老杜酱酒已到货,从明天起正式在甘肃武威开卖。可以不相信我说的话,但一定不要怀疑@杜子建的为人...
222016-01-012016年01月01日21:24华为Ascend P7【唯品会售假茅台后续:供货商被刑拘顾客获十倍补偿】此前,有网友投诉其在唯品会购买的茅台酒质量...
232016-01-012016年01月01日21:16实得惠省钱网唯品会卖假茅台,供货商被刑拘,买家获十倍补偿8888元|此前,有网友在网络论坛发贴(唯品会宣...
\n", + "
" + ], + "text/plain": [ + " date date_content source \\\n", + "0 2016-01-01 2016年01月01日23:41 Moto X \n", + "2 2016-01-01 2016年01月01日22:57 新浪博客 \n", + "6 2016-01-01 2016年01月01日22:56 新浪博客 \n", + "17 2016-01-01 2016年01月01日22:40 五蕴皆崆Android \n", + "18 2016-01-01 NaN NaN \n", + "19 2016-01-01 NaN NaN \n", + "20 2016-01-01 2016年01月01日21:46 360安全浏览器 \n", + "21 2016-01-01 2016年01月01日21:44 华为P8 \n", + "22 2016-01-01 2016年01月01日21:24 华为Ascend P7 \n", + "23 2016-01-01 2016年01月01日21:16 实得惠省钱网 \n", + "\n", + " content \n", + "0 #舆论之锤#唯品会发声明证实销售假茅台-手机腾讯网O网页链接分享来自浏览器! \n", + "2 2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原... \n", + "6 2016元旦节快乐酒粮网官方新品首发,茅台镇老酒,酱香原浆酒:酒粮网茅台镇白酒酱香老酒纯粮原... \n", + "17 开心,今天喝了两斤酒(茅台+扎二)三个人,开心! \n", + "18 一家专卖假货的网站某宝,你该学学了!//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品... \n", + "19 一家专卖假货的网站//【唯品会售假茅台:供货商被刑拘顾客获十倍补偿】O唯品会售假茅台:供货商... \n", + "20 前几天说了几点不看好茅台的理由,今年过节喝点茅台支持下,个人口感,茅台比小五好喝,茅台依然是... \n", + "21 老杜酱酒已到货,从明天起正式在甘肃武威开卖。可以不相信我说的话,但一定不要怀疑@杜子建的为人... \n", + "22 【唯品会售假茅台后续:供货商被刑拘顾客获十倍补偿】此前,有网友投诉其在唯品会购买的茅台酒质量... \n", + "23 唯品会卖假茅台,供货商被刑拘,买家获十倍补偿8888元|此前,有网友在网络论坛发贴(唯品会宣... 
" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df = df.drop_duplicates()\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60, 4)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Weibo Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.social_media.weibo_streaming import Weibo_Streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "rounds = 3\n", + "stock = \"茅台\"\n", + "config = {\n", + " \"use_proxy\": \"china_free\",\n", + " \"max_retry\": 5,\n", + " \"proxy_pages\": 5,\n", + " \"cookies\": \"Your_Login_Cookies\",\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Gathering free ips by pages...: 100%|██████████| 5/5 [00:09<00:00, 1.98s/it]\n", + "Checking ips: 100%|██████████| 75/75 [01:26<00:00, 1.15s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "获取到的代理ip数量: 75 。Get proxy ips: 75.\n", + "能用的代理数量: 19。Usable proxy ips: 19.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing the text content and downloading the full passage...: 100%|██████████| 9/9 [00:00<00:00, 64.89it/s]\n", + "Processing the text content and downloading the full passage...: 100%|██████████| 10/10 [00:09<00:00, 1.07it/s]\n", + "Processing the text content and downloading the full passage...: 100%|██████████| 10/10 [00:02<00:00, 4.93it/s]\n", + "Downloading by page..: 
100%|██████████| 3/3 [00:19<00:00, 6.46s/it]\n" + ] + } + ], + "source": [ + "downloader = Weibo_Streaming(config)\n", + "downloader.download_streaming_stock(stock = stock, rounds = rounds)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
card_typedisplay_followbtnmblogitemidactionlogcate_iddisplay_arrowshow_typeschemecontainer_colorcontainer_color_darkcontent_shortcontent
09False{'attitudes_count': 0, 'can_edit': False, 'com...seqid:187118896|type:61|t:|pos:1-0-0|q:茅台|srid...{'act_code': 554, 'ext': 'seqid:187118896|type...3101https://m.weibo.cn/status/MAWMprpPp?mblogid=MA...#EEEEEE#151515事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市...事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市...
19False{'attitudes_count': 0, 'can_edit': False, 'com...seqid:187118896|type:61|t:|pos:1-0-1|q:茅台|srid...{'act_code': 554, 'ext': 'seqid:187118896|type...3101https://m.weibo.cn/status/MAWHVDm0H?mblogid=MA...#EEEEEE#151515茅台茅台成都收4瓶飞天,自提茅台茅台成都收4瓶飞天,自提
\n", + "
" + ], + "text/plain": [ + " card_type display_followbtn \\\n", + "0 9 False \n", + "1 9 False \n", + "\n", + " mblog \\\n", + "0 {'attitudes_count': 0, 'can_edit': False, 'com... \n", + "1 {'attitudes_count': 0, 'can_edit': False, 'com... \n", + "\n", + " itemid \\\n", + "0 seqid:187118896|type:61|t:|pos:1-0-0|q:茅台|srid... \n", + "1 seqid:187118896|type:61|t:|pos:1-0-1|q:茅台|srid... \n", + "\n", + " actionlog cate_id display_arrow \\\n", + "0 {'act_code': 554, 'ext': 'seqid:187118896|type... 31 0 \n", + "1 {'act_code': 554, 'ext': 'seqid:187118896|type... 31 0 \n", + "\n", + " show_type scheme \\\n", + "0 1 https://m.weibo.cn/status/MAWMprpPp?mblogid=MA... \n", + "1 1 https://m.weibo.cn/status/MAWHVDm0H?mblogid=MA... \n", + "\n", + " container_color container_color_dark \\\n", + "0 #EEEEEE #151515 \n", + "1 #EEEEEE #151515 \n", + "\n", + " content_short \\\n", + "0 事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市... \n", + "1 茅台茅台成都收4瓶飞天,自提 \n", + "\n", + " content \n", + "0 事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市... \n", + "1 茅台茅台成都收4瓶飞天,自提 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = downloader.dataframe\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
content_shortcontent
0事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市...事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市...
1茅台茅台成都收4瓶飞天,自提茅台茅台成都收4瓶飞天,自提
2我可太喜欢茅台这个防伪了我可太喜欢茅台这个防伪了
3没想到 4S店的二楼 是卖茅台的吧没想到 4S店的二楼 是卖茅台的吧
4买不起茅台,砸锅卖铁也得买得起茅台冰淇淋 许昌·胖东来时代广场买不起茅台,砸锅卖铁也得买得起茅台冰淇淋 许昌·胖东来时代广场
5xxx给我枇杷xxx给我蜂蜜 xxx偷茅台喝(假的)。我很喜欢自己家的产品,感觉很无害纯天然...xxx给我枇杷xxx给我蜂蜜 xxx偷茅台喝(假的)。我很喜欢自己家的产品,感觉很无害纯天然...
6茅台 奎屯出一只兔茅茅台 奎屯出一只兔茅
72022胡润酒类品牌榜发布 2022胡润酒类品牌榜发布点评:与我印象中的有点出入。不出茅台和...2022胡润酒类品牌榜发布 2022胡润酒类品牌榜发布点评:与我印象中的有点出入。不出茅台和...
841岁,很美妙!“爸爸生日快乐,吃个蛋糕🍰”小奶音听着上头。爱人,亲戚,朋友,草莓🍓,茅台+...41岁,很美妙!“爸爸生日快乐,吃个蛋糕🍰”小奶音听着上头。爱人,亲戚,朋友,草莓🍓,茅台+...
0吃到了茅台冰激淋也吃到了茅台冰激淋也
\n", + "
" + ], + "text/plain": [ + " content_short \\\n", + "0 事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市... \n", + "1 茅台茅台成都收4瓶飞天,自提 \n", + "2 我可太喜欢茅台这个防伪了 \n", + "3 没想到 4S店的二楼 是卖茅台的吧 \n", + "4 买不起茅台,砸锅卖铁也得买得起茅台冰淇淋 许昌·胖东来时代广场 \n", + "5 xxx给我枇杷xxx给我蜂蜜 xxx偷茅台喝(假的)。我很喜欢自己家的产品,感觉很无害纯天然... \n", + "6 茅台 奎屯出一只兔茅 \n", + "7 2022胡润酒类品牌榜发布 2022胡润酒类品牌榜发布点评:与我印象中的有点出入。不出茅台和... \n", + "8 41岁,很美妙!“爸爸生日快乐,吃个蛋糕🍰”小奶音听着上头。爱人,亲戚,朋友,草莓🍓,茅台+... \n", + "0 吃到了茅台冰激淋也 \n", + "\n", + " content \n", + "0 事情做好做精,还可以赚大钱的生意才是好生意,而不是忙忙碌碌,最后一算账没赚多少!比如苹果的市... \n", + "1 茅台茅台成都收4瓶飞天,自提 \n", + "2 我可太喜欢茅台这个防伪了 \n", + "3 没想到 4S店的二楼 是卖茅台的吧 \n", + "4 买不起茅台,砸锅卖铁也得买得起茅台冰淇淋 许昌·胖东来时代广场 \n", + "5 xxx给我枇杷xxx给我蜂蜜 xxx偷茅台喝(假的)。我很喜欢自己家的产品,感觉很无害纯天然... \n", + "6 茅台 奎屯出一只兔茅 \n", + "7 2022胡润酒类品牌榜发布 2022胡润酒类品牌榜发布点评:与我印象中的有点出入。不出茅台和... \n", + "8 41岁,很美妙!“爸爸生日快乐,吃个蛋糕🍰”小奶音听着上头。爱人,亲戚,朋友,草莓🍓,茅台+... \n", + "0 吃到了茅台冰激淋也 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "selected_columns = [\"content_short\", \"content\"]\n", + "df[selected_columns].head(10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finrl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/test/Data_Sources_Tushare_Finance_News.ipynb b/FinNLP/test/Data_Sources_Tushare_Finance_News.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3036deb1ac0ceb6c191bb04fb3afc2af1d73d440 --- /dev/null +++ b/FinNLP/test/Data_Sources_Tushare_Finance_News.ipynb @@ -0,0 +1,682 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"..\")" 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from finnlp.data_sources.news.tushare_major_news import Tushare_Major_News" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Config" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "start_date = \"2022-01-01\"\n", + "end_date = \"2022-01-05\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### News_downloader" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "news_downloader = Tushare_finance()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3c0d1c2084844174ba7d851a7e118f94", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:00\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mnews_downloader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdownload_news\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstart_date\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mend_date\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_133628\\3702496861.py\u001b[0m in \u001b[0;36mdownload_news\u001b[1;34m(self, start_date, end_date, stock)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mdate\u001b[0m \u001b[1;32min\u001b[0m 
\u001b[0mtqdm\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdate_list\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 17\u001b[1;33m \u001b[0mtmp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgather_one_day_news\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdate\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 18\u001b[0m \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mres\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mtmp\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 19\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdataframe\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mres\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_133628\\3702496861.py\u001b[0m in \u001b[0;36mgather_one_day_news\u001b[1;34m(self, date, stock, delay)\u001b[0m\n\u001b[0;32m 21\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mgather_one_day_news\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdate\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mstock\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"all\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mdelay\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0.1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[0mdate\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtransfer_standard_date_to_nonstandard\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdate\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 23\u001b[1;33m 
\u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpro\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmajor_news\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstart_date\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdate\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mend_date\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdate\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 24\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdelay\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\Users\\Olive\\.conda\\envs\\finrl\\lib\\site-packages\\tushare\\pro\\client.py\u001b[0m in \u001b[0;36mquery\u001b[1;34m(self, api_name, fields, **kwargs)\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mres\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 43\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'code'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 44\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'msg'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 45\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mresult\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'data'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'fields'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mException\u001b[0m: 抱歉,您每小时最多访问该接口4次,权限的具体详情访问:https://tushare.pro/document/1?doc_id=108。" + ] + } + ], + "source": [ + "news_downloader.download_news(start_date,end_date)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(60, 3)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlepub_timesrc
0市政协委员、奇安信集团董事长齐向东:建议北京市推进“第三方数据监管平台”和“跨行业反诈大数据...2022-01-04 23:59:24同花顺
1新增30家!海珠公布第二批校外培训机构预收费资金监管账户,家长要留心2022-01-04 23:45:12同花顺
2市政协委员、北京市石景山区督查督办科科长杨朝霞:尽快对全市6100余个“骑沿井”进行改造2022-01-04 23:43:55同花顺
3延庆区教委:区内冰雪技能普及人数累计达到3.1万余人2022-01-04 23:35:26同花顺
4RCEP生效 为全球经济注入新动能2022-01-04 23:33:46同花顺
5关于召开“2021年度江苏省钢铁行业十大新闻”、“2021年度钢铁工业十大技术成果”等专家评...2022-01-04 23:32:18同花顺
63450万元卖参股公司0.23%股权 2名独董投弃权票 诚迈科技收关注函2022-01-04 23:25:38新浪财经
7天津《企业开办登记规范》地方标准发布 助企业开办1个工作日办结2022-01-04 23:22:40同花顺
8我国将实施企业信用风险分类管理2022-01-04 23:22:27同花顺
9德国去年12月失业率降至5.2%2022-01-04 23:21:18同花顺
10元旦假期南京共出行约1197.89万人次2022-01-04 23:16:38同花顺
11统计上严重失信企业信息公示:柳州市金百汇激光技术有限责任公司2022-01-04 23:10:35同花顺
12关于对浙江天铁实业股份有限公司及相关人员采取出具警示函措施的决定2022-01-04 23:10:26同花顺
13就在明天,2022年首只银行股将申购2022-01-04 23:08:43新浪财经
14近15亿元资产1元钱就卖?交易所问询,公司回应来了2022-01-04 23:08:25新浪财经
15冲击高端市场迈出第一步 realme发布真我GT2 Pro旗舰手机2022-01-04 23:07:37同花顺
16倒计时一个月!北京冬奥会开幕临近,概念股提前起飞,这些上市公司积极备战2022-01-04 23:07:36新浪财经
17倒计时一个月!北京冬奥会开幕临近,概念股提前起飞,这些上市公司积极备战2022-01-04 23:05:47同花顺
18企业实施碳中和战略要把握五个关键点2022-01-04 23:05:06同花顺
19气候投融资试点工作方案发布地方 政府可从五大方面推进2022-01-04 23:04:49同花顺
20网信办等4部门规范管理算法推荐 未来算法产业方向明朗2022-01-04 23:04:04同花顺
21绿色投资不是情怀 券商ESG行动需不断突破2022-01-04 23:03:31同花顺
22中药掀涨停潮CXO遭遇重挫 系资金切换过激所致2022-01-04 23:03:20同花顺
23TCL发布第一款笔记本电脑,售价349美元主打轻便与性价比2022-01-04 23:01:53新浪财经
24美股三大股指涨跌不一 热门中概股持续走低掌门教育盘中跌超21%2022-01-04 23:00:31凤凰财经
252.16亿股!中国泛海转让民生银行股权,所持泛海控股超1亿股也将被拍卖2022-01-04 22:56:55新浪财经
26柳州市举办先进激光技术应用论坛2022-01-04 22:56:09同花顺
27喜讯!建设集团中标贵州省城乡和住房建设厅两个磷石膏研发科技项目2022-01-04 22:55:32同花顺
28盘点2021家电行业:回购股票激励员工、多元化布局追“风口”2022-01-04 22:52:40同花顺
29泰康系减持阳光城股份亏损超16亿元 接盘方实力存疑2022-01-04 22:52:32同花顺
30各地政策密集落地,A股氢燃料汽车板块未来的潜力点在哪?2022-01-04 22:52:24新浪财经
31【2022年行业展望】汽车金融公司:资本充足,盈利性强,资产质量好,信用展望稳定2022-01-04 22:52:10同花顺
32公安部新修制订3部部门规章2022-01-04 22:51:56同花顺
33市政协委员欧云崧:尽快推广实施共享单车自主划定禁停区2022-01-04 22:51:01同花顺
34仁桥资产夏俊杰:2022年最佳风险收益比的机会在港股2022-01-04 22:47:55同花顺
35进化论资产王一平:2022年从四个方向挖掘投资机会2022-01-04 22:47:50同花顺
36元旦档总票房超10亿收官,12部电影将激战春节档2022-01-04 22:45:16同花顺
37美国酒商激战进行时2022-01-04 22:44:47同花顺
38中集分拆化工装备板块上市 中集安瑞环科闯关创业板2022-01-04 22:43:46新浪财经
39新年首个交易日“宁王”跌3.25%,新能源行情是否结束了2022-01-04 22:39:43新浪财经
40把怀柔科学城打造成“梦之城”,政协委员谈“科技创新中心建设”2022-01-04 22:36:21同花顺
41银联网络元旦消费金额创新高2022-01-04 22:34:47同花顺
42美国5年期国债收益率创2020年2月来最高 市场愈发坚定美联储加息预期2022-01-04 22:34:14同花顺
432022年北交所打新第一股来了!募资约1.54亿元,吸引9家战投认购2022-01-04 22:33:20新浪财经
44北交所首单重大资产重组出炉!中航泰达拟4.68亿元参与北方稀土子公司混改,明日起复牌2022-01-04 22:33:00新浪财经
45车企争秀2021年新能源“成绩单”!特斯拉全球交付量超93万辆,“蔚小理”年销已逼近10万辆关口2022-01-04 22:31:26同花顺
46再次押注海外休闲游戏,字节跳动是如何布局游戏行业投资的2022-01-04 22:30:50同花顺
47中长期利率趋稳2022-01-04 22:28:57同花顺
48滴血验癌公司创始人被定罪,指控其犯有电信欺诈罪2022-01-04 22:28:03同花顺
49HMVOD视频拟购入泰国节目2022-01-04 22:27:07同花顺
50国务院批准:在海南这个地方开展高水平开放试点2022-01-04 22:23:36同花顺
515日零时起,东莞市全域均为低风险地区2022-01-04 22:22:27同花顺
52缺芯”压力仍存,大众在华转型进入“阵痛期2022-01-04 22:22:19同花顺
53全国首例!苏州立案查处一起全国碳市场碳排放配额未按期履约案件2022-01-04 22:22:00同花顺
54分析人士:印尼禁止煤炭出口对我国影响有限2022-01-04 22:21:27同花顺
55如祺出行获战略投资,车企布局出行市场为哪般2022-01-04 22:21:26同花顺
56美国过去一周激增新冠确诊病例336万 美媒:数据“仍然被低估了”2022-01-04 22:20:38同花顺
57国家发展改革委特别安排海南专项100亿元2022-01-04 22:17:51同花顺
58牧原、双汇牵手:多方看好的“联姻”为何迟到10年2022-01-04 22:17:32新浪财经
59发起“全民热身月” 赖茅跨界体育深耕消费市场2022-01-04 22:15:47同花顺
\n", + "
" + ], + "text/plain": [ + " title pub_time \\\n", + "0 市政协委员、奇安信集团董事长齐向东:建议北京市推进“第三方数据监管平台”和“跨行业反诈大数据... 2022-01-04 23:59:24 \n", + "1 新增30家!海珠公布第二批校外培训机构预收费资金监管账户,家长要留心 2022-01-04 23:45:12 \n", + "2 市政协委员、北京市石景山区督查督办科科长杨朝霞:尽快对全市6100余个“骑沿井”进行改造 2022-01-04 23:43:55 \n", + "3 延庆区教委:区内冰雪技能普及人数累计达到3.1万余人 2022-01-04 23:35:26 \n", + "4 RCEP生效 为全球经济注入新动能 2022-01-04 23:33:46 \n", + "5 关于召开“2021年度江苏省钢铁行业十大新闻”、“2021年度钢铁工业十大技术成果”等专家评... 2022-01-04 23:32:18 \n", + "6 3450万元卖参股公司0.23%股权 2名独董投弃权票 诚迈科技收关注函 2022-01-04 23:25:38 \n", + "7 天津《企业开办登记规范》地方标准发布 助企业开办1个工作日办结 2022-01-04 23:22:40 \n", + "8 我国将实施企业信用风险分类管理 2022-01-04 23:22:27 \n", + "9 德国去年12月失业率降至5.2% 2022-01-04 23:21:18 \n", + "10 元旦假期南京共出行约1197.89万人次 2022-01-04 23:16:38 \n", + "11 统计上严重失信企业信息公示:柳州市金百汇激光技术有限责任公司 2022-01-04 23:10:35 \n", + "12 关于对浙江天铁实业股份有限公司及相关人员采取出具警示函措施的决定 2022-01-04 23:10:26 \n", + "13 就在明天,2022年首只银行股将申购 2022-01-04 23:08:43 \n", + "14 近15亿元资产1元钱就卖?交易所问询,公司回应来了 2022-01-04 23:08:25 \n", + "15 冲击高端市场迈出第一步 realme发布真我GT2 Pro旗舰手机 2022-01-04 23:07:37 \n", + "16 倒计时一个月!北京冬奥会开幕临近,概念股提前起飞,这些上市公司积极备战 2022-01-04 23:07:36 \n", + "17 倒计时一个月!北京冬奥会开幕临近,概念股提前起飞,这些上市公司积极备战 2022-01-04 23:05:47 \n", + "18 企业实施碳中和战略要把握五个关键点 2022-01-04 23:05:06 \n", + "19 气候投融资试点工作方案发布地方 政府可从五大方面推进 2022-01-04 23:04:49 \n", + "20 网信办等4部门规范管理算法推荐 未来算法产业方向明朗 2022-01-04 23:04:04 \n", + "21 绿色投资不是情怀 券商ESG行动需不断突破 2022-01-04 23:03:31 \n", + "22 中药掀涨停潮CXO遭遇重挫 系资金切换过激所致 2022-01-04 23:03:20 \n", + "23 TCL发布第一款笔记本电脑,售价349美元主打轻便与性价比 2022-01-04 23:01:53 \n", + "24 美股三大股指涨跌不一 热门中概股持续走低掌门教育盘中跌超21% 2022-01-04 23:00:31 \n", + "25 2.16亿股!中国泛海转让民生银行股权,所持泛海控股超1亿股也将被拍卖 2022-01-04 22:56:55 \n", + "26 柳州市举办先进激光技术应用论坛 2022-01-04 22:56:09 \n", + "27 喜讯!建设集团中标贵州省城乡和住房建设厅两个磷石膏研发科技项目 2022-01-04 22:55:32 \n", + "28 盘点2021家电行业:回购股票激励员工、多元化布局追“风口” 2022-01-04 22:52:40 \n", + "29 泰康系减持阳光城股份亏损超16亿元 接盘方实力存疑 2022-01-04 22:52:32 \n", + "30 各地政策密集落地,A股氢燃料汽车板块未来的潜力点在哪? 
2022-01-04 22:52:24 \n", + "31 【2022年行业展望】汽车金融公司:资本充足,盈利性强,资产质量好,信用展望稳定 2022-01-04 22:52:10 \n", + "32 公安部新修制订3部部门规章 2022-01-04 22:51:56 \n", + "33 市政协委员欧云崧:尽快推广实施共享单车自主划定禁停区 2022-01-04 22:51:01 \n", + "34 仁桥资产夏俊杰:2022年最佳风险收益比的机会在港股 2022-01-04 22:47:55 \n", + "35 进化论资产王一平:2022年从四个方向挖掘投资机会 2022-01-04 22:47:50 \n", + "36 元旦档总票房超10亿收官,12部电影将激战春节档 2022-01-04 22:45:16 \n", + "37 美国酒商激战进行时 2022-01-04 22:44:47 \n", + "38 中集分拆化工装备板块上市 中集安瑞环科闯关创业板 2022-01-04 22:43:46 \n", + "39 新年首个交易日“宁王”跌3.25%,新能源行情是否结束了 2022-01-04 22:39:43 \n", + "40 把怀柔科学城打造成“梦之城”,政协委员谈“科技创新中心建设” 2022-01-04 22:36:21 \n", + "41 银联网络元旦消费金额创新高 2022-01-04 22:34:47 \n", + "42 美国5年期国债收益率创2020年2月来最高 市场愈发坚定美联储加息预期 2022-01-04 22:34:14 \n", + "43 2022年北交所打新第一股来了!募资约1.54亿元,吸引9家战投认购 2022-01-04 22:33:20 \n", + "44 北交所首单重大资产重组出炉!中航泰达拟4.68亿元参与北方稀土子公司混改,明日起复牌 2022-01-04 22:33:00 \n", + "45 车企争秀2021年新能源“成绩单”!特斯拉全球交付量超93万辆,“蔚小理”年销已逼近10万辆关口 2022-01-04 22:31:26 \n", + "46 再次押注海外休闲游戏,字节跳动是如何布局游戏行业投资的 2022-01-04 22:30:50 \n", + "47 中长期利率趋稳 2022-01-04 22:28:57 \n", + "48 滴血验癌公司创始人被定罪,指控其犯有电信欺诈罪 2022-01-04 22:28:03 \n", + "49 HMVOD视频拟购入泰国节目 2022-01-04 22:27:07 \n", + "50 国务院批准:在海南这个地方开展高水平开放试点 2022-01-04 22:23:36 \n", + "51 5日零时起,东莞市全域均为低风险地区 2022-01-04 22:22:27 \n", + "52 缺芯”压力仍存,大众在华转型进入“阵痛期 2022-01-04 22:22:19 \n", + "53 全国首例!苏州立案查处一起全国碳市场碳排放配额未按期履约案件 2022-01-04 22:22:00 \n", + "54 分析人士:印尼禁止煤炭出口对我国影响有限 2022-01-04 22:21:27 \n", + "55 如祺出行获战略投资,车企布局出行市场为哪般 2022-01-04 22:21:26 \n", + "56 美国过去一周激增新冠确诊病例336万 美媒:数据“仍然被低估了” 2022-01-04 22:20:38 \n", + "57 国家发展改革委特别安排海南专项100亿元 2022-01-04 22:17:51 \n", + "58 牧原、双汇牵手:多方看好的“联姻”为何迟到10年 2022-01-04 22:17:32 \n", + "59 发起“全民热身月” 赖茅跨界体育深耕消费市场 2022-01-04 22:15:47 \n", + "\n", + " src \n", + "0 同花顺 \n", + "1 同花顺 \n", + "2 同花顺 \n", + "3 同花顺 \n", + "4 同花顺 \n", + "5 同花顺 \n", + "6 新浪财经 \n", + "7 同花顺 \n", + "8 同花顺 \n", + "9 同花顺 \n", + "10 同花顺 \n", + "11 同花顺 \n", + "12 同花顺 \n", + "13 新浪财经 \n", + "14 新浪财经 \n", + "15 同花顺 \n", + "16 新浪财经 \n", + "17 同花顺 \n", + "18 同花顺 \n", + "19 同花顺 \n", + "20 同花顺 \n", 
+ "21 同花顺 \n", + "22 同花顺 \n", + "23 新浪财经 \n", + "24 凤凰财经 \n", + "25 新浪财经 \n", + "26 同花顺 \n", + "27 同花顺 \n", + "28 同花顺 \n", + "29 同花顺 \n", + "30 新浪财经 \n", + "31 同花顺 \n", + "32 同花顺 \n", + "33 同花顺 \n", + "34 同花顺 \n", + "35 同花顺 \n", + "36 同花顺 \n", + "37 同花顺 \n", + "38 新浪财经 \n", + "39 新浪财经 \n", + "40 同花顺 \n", + "41 同花顺 \n", + "42 同花顺 \n", + "43 新浪财经 \n", + "44 新浪财经 \n", + "45 同花顺 \n", + "46 同花顺 \n", + "47 同花顺 \n", + "48 同花顺 \n", + "49 同花顺 \n", + "50 同花顺 \n", + "51 同花顺 \n", + "52 同花顺 \n", + "53 同花顺 \n", + "54 同花顺 \n", + "55 同花顺 \n", + "56 同花顺 \n", + "57 同花顺 \n", + "58 新浪财经 \n", + "59 同花顺 " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "news_downloader.dataframe" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "finrl", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "afd6dc03c9be451573fc2885de79a969af6a24a159f11a3ead741ab7a9ff405f" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/FinNLP/test/Data_Sources_twitter.ipynb b/FinNLP/test/Data_Sources_twitter.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..a63deaf04c9726b33d943e27d350a83bc6ca897f --- /dev/null +++ b/FinNLP/test/Data_Sources_twitter.ipynb @@ -0,0 +1,569 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append(\"..\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Import" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from 
finnlp.data_sources.social_media.twitter import Twitter_Downloader" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Config" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "stock = \"AAPL\"\n", + "start_date = \"2023-01-01\"\n", + "end_date = \"2023-01-05\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Downloader" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "downloader = Twitter_Downloader()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f95d73d68fff4354aadfd0482bb52952", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
created_atidid_strfull_texttruncateddisplay_text_rangeentitiesextended_entitiessourcein_reply_to_status_id...retweetedpossibly_sensitivepossibly_sensitive_editablelangsupplemental_languageself_threadquoted_status_idquoted_status_id_strquoted_status_permalinkcard
02023-01-01 01:25:43+00:00160936018469415731216093601846941573122022 was the birth this movement. 2023 is when...False[0, 147]{'hashtags': [{'text': 'SPY', 'indices': [97, ...NaN<a href=\"https://mobile.twitter.com\" rel=\"nofo...1609360182714241024...FalseNaNNaNenNone{'id': 1609360176640925699, 'id_str': '1609360...NaNNaNNaNNaN
12023-01-01 03:20:49+00:0016093891512538357771609389151253835777たくさんコメントありがとうございました☺️False[0, 21]{'hashtags': [], 'symbols': [], 'user_mentions...NaN<a href=\"http://twitter.com/download/iphone\" r...None...FalseNaNNaNjaNoneNaNNaNNaNNaNNaN
22023-01-01 04:21:54+00:0016094045228033638461609404522803363846The fall of Apple iphone market share.\\n- peak...False[0, 212]{'hashtags': [], 'symbols': [{'text': 'AAPL', ...{'media': [{'id': 1609404518500032514, 'id_str...<a href=\"http://twitter.com/download/iphone\" r...None...FalseFalseTrueenNone{'id': 1609404522803363846, 'id_str': '1609404...NaNNaNNaNNaN
32023-01-01 04:32:23+00:0016094071636714004481609407163671400448Apple iphone market share peaked in H1 2009 an...False[0, 160]{'hashtags': [], 'symbols': [{'text': 'AAPL', ...{'media': [{'id': 1609407158696972289, 'id_str...<a href=\"http://twitter.com/download/iphone\" r...1609404522803363846...FalseFalseTrueenNone{'id': 1609404522803363846, 'id_str': '1609404...NaNNaNNaNNaN
42023-01-01 04:43:47+00:0016094100327347118091609410032734711809That sounds impossible if we look at how fast ...False[0, 272]{'hashtags': [{'text': 'iPhone', 'indices': [2...{'media': [{'id': 1609410028653645824, 'id_str...<a href=\"http://twitter.com/download/iphone\" r...1609407163671400448...FalseFalseTrueenNone{'id': 1609404522803363846, 'id_str': '1609404...NaNNaNNaNNaN
..................................................................
792023-01-04 21:45:11+00:0016107542370041897101610754237004189710APPLE $AAPL TO SIGN UP LUXSHARE TO PRODUCE IPH...False[0, 64]{'hashtags': [], 'symbols': [{'text': 'AAPL', ...NaN<a href=\"https://mobile.twitter.com\" rel=\"nofo...None...FalseNaNNaNenNoneNaNNaNNaNNaNNaN
802023-01-04 22:21:45+00:0016107634420921835851610763442092183585$AAPL https://t.co/Fb8UbPUy9SFalse[0, 5]{'hashtags': [], 'symbols': [{'text': 'AAPL', ...{'media': [{'id': 1610763438053068835, 'id_str...<a href=\"http://twitter.com/download/iphone\" r...None...FalseFalseTrueplNoneNaNNaNNaNNaNNaN
812023-01-04 22:53:05+00:0016107713243553464321610771324355346432$AAPL This doesn't fix their demand issues fol...False[0, 49]{'hashtags': [], 'symbols': [{'text': 'AAPL', ...{'media': [{'id': 1610771243019689984, 'id_str...<a href=\"https://mobile.twitter.com\" rel=\"nofo...None...FalseFalseTrueenNoneNaNNaNNaNNaNNaN
822023-01-04 23:10:20+00:0016107756689245839361610775668924583936These TOP companies have cash in the bank!\\n$A...False[0, 173]{'hashtags': [], 'symbols': [{'text': 'AAPL', ...NaN<a href=\"http://twitter.com/download/iphone\" r...None...FalseNaNNaNenNoneNaNNaNNaNNaNNaN
832023-01-04 23:10:46+00:0016107757770830069761610775777083006976Darvas strategy. \\n\\nPart 15• \\n\\nAlways speak...False[0, 122]{'hashtags': [], 'symbols': [{'text': 'MSFT', ...{'media': [{'id': 1610775771181682690, 'id_str...<a href=\"http://twitter.com/download/iphone\" r...None...FalseFalseTrueenNone{'id': 1610775777083006976, 'id_str': '1610775...NaNNaNNaNNaN
\n", + "

84 rows × 38 columns

\n", + "" + ], + "text/plain": [ + " created_at id id_str \\\n", + "0 2023-01-01 01:25:43+00:00 1609360184694157312 1609360184694157312 \n", + "1 2023-01-01 03:20:49+00:00 1609389151253835777 1609389151253835777 \n", + "2 2023-01-01 04:21:54+00:00 1609404522803363846 1609404522803363846 \n", + "3 2023-01-01 04:32:23+00:00 1609407163671400448 1609407163671400448 \n", + "4 2023-01-01 04:43:47+00:00 1609410032734711809 1609410032734711809 \n", + ".. ... ... ... \n", + "79 2023-01-04 21:45:11+00:00 1610754237004189710 1610754237004189710 \n", + "80 2023-01-04 22:21:45+00:00 1610763442092183585 1610763442092183585 \n", + "81 2023-01-04 22:53:05+00:00 1610771324355346432 1610771324355346432 \n", + "82 2023-01-04 23:10:20+00:00 1610775668924583936 1610775668924583936 \n", + "83 2023-01-04 23:10:46+00:00 1610775777083006976 1610775777083006976 \n", + "\n", + " full_text truncated \\\n", + "0 2022 was the birth this movement. 2023 is when... False \n", + "1 たくさんコメントありがとうございました☺️ False \n", + "2 The fall of Apple iphone market share.\\n- peak... False \n", + "3 Apple iphone market share peaked in H1 2009 an... False \n", + "4 That sounds impossible if we look at how fast ... False \n", + ".. ... ... \n", + "79 APPLE $AAPL TO SIGN UP LUXSHARE TO PRODUCE IPH... False \n", + "80 $AAPL https://t.co/Fb8UbPUy9S False \n", + "81 $AAPL This doesn't fix their demand issues fol... False \n", + "82 These TOP companies have cash in the bank!\\n$A... False \n", + "83 Darvas strategy. \\n\\nPart 15• \\n\\nAlways speak... False \n", + "\n", + " display_text_range entities \\\n", + "0 [0, 147] {'hashtags': [{'text': 'SPY', 'indices': [97, ... \n", + "1 [0, 21] {'hashtags': [], 'symbols': [], 'user_mentions... \n", + "2 [0, 212] {'hashtags': [], 'symbols': [{'text': 'AAPL', ... \n", + "3 [0, 160] {'hashtags': [], 'symbols': [{'text': 'AAPL', ... \n", + "4 [0, 272] {'hashtags': [{'text': 'iPhone', 'indices': [2... \n", + ".. ... ... 
\n", + "79 [0, 64] {'hashtags': [], 'symbols': [{'text': 'AAPL', ... \n", + "80 [0, 5] {'hashtags': [], 'symbols': [{'text': 'AAPL', ... \n", + "81 [0, 49] {'hashtags': [], 'symbols': [{'text': 'AAPL', ... \n", + "82 [0, 173] {'hashtags': [], 'symbols': [{'text': 'AAPL', ... \n", + "83 [0, 122] {'hashtags': [], 'symbols': [{'text': 'MSFT', ... \n", + "\n", + " extended_entities \\\n", + "0 NaN \n", + "1 NaN \n", + "2 {'media': [{'id': 1609404518500032514, 'id_str... \n", + "3 {'media': [{'id': 1609407158696972289, 'id_str... \n", + "4 {'media': [{'id': 1609410028653645824, 'id_str... \n", + ".. ... \n", + "79 NaN \n", + "80 {'media': [{'id': 1610763438053068835, 'id_str... \n", + "81 {'media': [{'id': 1610771243019689984, 'id_str... \n", + "82 NaN \n", + "83 {'media': [{'id': 1610775771181682690, 'id_str... \n", + "\n", + " source in_reply_to_status_id \\\n", + "0
+ + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/finetuned_model_bak/adapter_config.json b/finetuned_model_bak/adapter_config.json new file mode 100644 index 
0000000000000000000000000000000000000000..d0fe34c158e3ed229d4e62c62fd97da928d24195 --- /dev/null +++ b/finetuned_model_bak/adapter_config.json @@ -0,0 +1,27 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "THUDM/chatglm2-6b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "query_key_value" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/finetuned_model_bak/adapter_model.safetensors b/finetuned_model_bak/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..40b7f6668a543de749406e462dc4094eef437e0d --- /dev/null +++ b/finetuned_model_bak/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1feccf05fc846f355c47a4b625f1da186da240b8a196af4616e805a9f8a57742 +size 7807744 diff --git a/finetuned_model_bak/checkpoint-100/adapter_model.bin b/finetuned_model_bak/checkpoint-100/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a32abf5940a8c505eab8e9232a1b14b00408645 --- /dev/null +++ b/finetuned_model_bak/checkpoint-100/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85f126601be8f7234a5746908b03cbeda714a0018308beb02668e361dd6e4e36 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-100/optimizer.pt b/finetuned_model_bak/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a6b542609f47755084dd20516354a3a69186c1a --- /dev/null +++ b/finetuned_model_bak/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0a0c85ee1ab7049f38dd2dfc725f316774c609c79c51f540cce5fcb03f17af0b +size 15645387 diff --git a/finetuned_model_bak/checkpoint-100/rng_state.pth b/finetuned_model_bak/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..216639c203be534ed9adb5823109d458e618c832 --- /dev/null +++ b/finetuned_model_bak/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:136758226135b9a3d12bbd2a3fed4cdfa994fe79c7e5e50feafd75c6878f20bc +size 14645 diff --git a/finetuned_model_bak/checkpoint-100/scheduler.pt b/finetuned_model_bak/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..48b3cba7198df763af7cc990afeb9ad0f95cd069 --- /dev/null +++ b/finetuned_model_bak/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e76148a9485c8ac7665d6431bd8552f1e285c455bee93aad01b26c4c14e462 +size 1465 diff --git a/finetuned_model_bak/checkpoint-100/trainer_state.json b/finetuned_model_bak/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a164c89c285b3d256ac2ce7627117ab6837c1500 --- /dev/null +++ b/finetuned_model_bak/checkpoint-100/trainer_state.json @@ -0,0 +1,63 @@ +{ + "best_metric": 6.069918632507324, + "best_model_checkpoint": "./finetuned_model\\checkpoint-100", + "epoch": 0.2095886822111606, + "eval_steps": 50, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + 
"grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9116311796711424.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-100/training_args.bin b/finetuned_model_bak/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-150/adapter_model.bin b/finetuned_model_bak/checkpoint-150/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e13bca37c8fa8860752190aa20e71763f06f1e19 --- /dev/null +++ b/finetuned_model_bak/checkpoint-150/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eebf5aac42c3e2ef84b391b35b33a5d7707505165c5991a4a4dbf143c01c1a8 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-150/optimizer.pt b/finetuned_model_bak/checkpoint-150/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9dc072cccc30b5aa341ee8e9ecdff1893beb2ecc --- /dev/null +++ b/finetuned_model_bak/checkpoint-150/optimizer.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ed6fcb8caf30ed15c94d09c608d95c136df84baf4c6555156d42accaa8fe7c65 +size 15645387 diff --git a/finetuned_model_bak/checkpoint-150/rng_state.pth b/finetuned_model_bak/checkpoint-150/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a774788526fabfb4fb820f336a1933ed2c17962b --- /dev/null +++ b/finetuned_model_bak/checkpoint-150/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5bb6e223405d4fc604efc7a0f343dba164b29054b8ceb727bbafe0c3d74def2 +size 14645 diff --git a/finetuned_model_bak/checkpoint-150/scheduler.pt b/finetuned_model_bak/checkpoint-150/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..20281e1bfaa9169492c0ea260f4d79f7c4a68edc --- /dev/null +++ b/finetuned_model_bak/checkpoint-150/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4500ac9ad8cecf0eb8eb127b7c4a7855edd3e4e455fb2b9e313fb79561f76f83 +size 1465 diff --git a/finetuned_model_bak/checkpoint-150/trainer_state.json b/finetuned_model_bak/checkpoint-150/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..03d74912d374be0f6ec54c975290028625494fd3 --- /dev/null +++ b/finetuned_model_bak/checkpoint-150/trainer_state.json @@ -0,0 +1,78 @@ +{ + "best_metric": 6.042043685913086, + "best_model_checkpoint": "./finetuned_model\\checkpoint-150", + "epoch": 0.3143830233167409, + "eval_steps": 50, + "global_step": 150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + 
"grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.360314785243136e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-150/training_args.bin b/finetuned_model_bak/checkpoint-150/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-150/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-200/adapter_model.bin b/finetuned_model_bak/checkpoint-200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..14f48cc217157b2dde3b7fc4285fb6801fece0fc --- /dev/null +++ b/finetuned_model_bak/checkpoint-200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8997337c2b6f645525f3b88fd9f4ffe0944ad695ccfc092eb372759fe7ae890f +size 
7820719 diff --git a/finetuned_model_bak/checkpoint-200/optimizer.pt b/finetuned_model_bak/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..464f734c46364bdf23f33b0cd60c358050ffa45d --- /dev/null +++ b/finetuned_model_bak/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88ce61d63ad5eb63115e78a061cb8a3bf616b9c1700c613e89553f7ba42fb43d +size 15645387 diff --git a/finetuned_model_bak/checkpoint-200/rng_state.pth b/finetuned_model_bak/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..462353ee9c63947a2ce38206263c722e5d912494 --- /dev/null +++ b/finetuned_model_bak/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15d4315e75260dd160f0e904c7f8110d4016f7180e009a3e4b3f69249eb65e1 +size 14645 diff --git a/finetuned_model_bak/checkpoint-200/scheduler.pt b/finetuned_model_bak/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..89fbcb2d18817d29f084a11a633050751342d017 --- /dev/null +++ b/finetuned_model_bak/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:813919858239b196492c7e8bd719cef0947b3268b516e401aea4636257c3836b +size 1465 diff --git a/finetuned_model_bak/checkpoint-200/trainer_state.json b/finetuned_model_bak/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..193f677136609609954921e3931e4a5abc454e69 --- /dev/null +++ b/finetuned_model_bak/checkpoint-200/trainer_state.json @@ -0,0 +1,93 @@ +{ + "best_metric": 6.017848014831543, + "best_model_checkpoint": "./finetuned_model\\checkpoint-200", + "epoch": 0.4191773644223212, + "eval_steps": 50, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 
2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8147384183914496e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-200/training_args.bin b/finetuned_model_bak/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ 
b/finetuned_model_bak/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-250/adapter_model.bin b/finetuned_model_bak/checkpoint-250/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..485142063c0af96f6ebf8c08330c5a25435a97dd --- /dev/null +++ b/finetuned_model_bak/checkpoint-250/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31316088cd718b5e911db256dcad0f333e70f13e17ab7c003756f3c173caaf98 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-250/optimizer.pt b/finetuned_model_bak/checkpoint-250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4633c677a7f9e79fa69ab4e424fad4746a5c844 --- /dev/null +++ b/finetuned_model_bak/checkpoint-250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ea39a6a7cbd041a78229fca64edceb05f25d3f25b4ac20b82f7bc5f814b4ae +size 15645387 diff --git a/finetuned_model_bak/checkpoint-250/rng_state.pth b/finetuned_model_bak/checkpoint-250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..98344fbea135fd52ff4e7159a7bb12674072f65d --- /dev/null +++ b/finetuned_model_bak/checkpoint-250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f6cc6e18081642f4c2d3239fcf28f79e2b3770891e223bb10535640126142e4 +size 14645 diff --git a/finetuned_model_bak/checkpoint-250/scheduler.pt b/finetuned_model_bak/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..27770766edaf03f5cf765db8f3bf40260a1a5cc3 --- /dev/null +++ b/finetuned_model_bak/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:921945b175f055b04c86c60aa688b9cd76b55f75431ae5e4166445536da7ceb1 +size 1465 diff --git 
a/finetuned_model_bak/checkpoint-250/trainer_state.json b/finetuned_model_bak/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8ba4a5c4941f222d59595b3c096d46626c6cf4d7 --- /dev/null +++ b/finetuned_model_bak/checkpoint-250/trainer_state.json @@ -0,0 +1,108 @@ +{ + "best_metric": 5.883882522583008, + "best_model_checkpoint": "./finetuned_model\\checkpoint-250", + "epoch": 0.5239717055279015, + "eval_steps": 50, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 
1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2608820617609216e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-250/training_args.bin b/finetuned_model_bak/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-300/adapter_model.bin b/finetuned_model_bak/checkpoint-300/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..88fe6dd06ae7ccecc8fafca0bd36c3784d2b2cab --- /dev/null +++ b/finetuned_model_bak/checkpoint-300/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27a89af4f6fc8c3343d11cb6bd5a43acf3b28a0d7e9c164d2168af8434c55b49 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-300/optimizer.pt b/finetuned_model_bak/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3791831439787490aef91ff9aefb2501489f1930 --- /dev/null +++ b/finetuned_model_bak/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:520756146bab499c9c80de6c2cd0c8309f812099c70111b14f9ced89ee8549d5 +size 15645387 diff --git a/finetuned_model_bak/checkpoint-300/rng_state.pth b/finetuned_model_bak/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1909617015c2a1db86fa82fc6e15179d92fae47b --- /dev/null +++ b/finetuned_model_bak/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07a201f92318429f90cbe6cb46a6275b9e28a1c1198e6725199e54289df5bc50 +size 14645 diff --git a/finetuned_model_bak/checkpoint-300/scheduler.pt b/finetuned_model_bak/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f9eb9d29c7f736db7f715e3f15c7a80c04ffcfc --- /dev/null +++ b/finetuned_model_bak/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a93a17156d2e15f3dd1299c654c63df54c2e4e0ac64f7f8286a486e3d30b5aea +size 1465 diff --git a/finetuned_model_bak/checkpoint-300/trainer_state.json b/finetuned_model_bak/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8129e9ea282f5f64093de8aa9cd1e07cd72b6113 --- /dev/null +++ b/finetuned_model_bak/checkpoint-300/trainer_state.json @@ -0,0 +1,123 @@ +{ + "best_metric": 5.672642230987549, + "best_model_checkpoint": "./finetuned_model\\checkpoint-300", + "epoch": 0.6287660466334818, + "eval_steps": 50, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + 
"grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + 
"total_flos": 2.71323928498176e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-300/training_args.bin b/finetuned_model_bak/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-350/adapter_model.bin b/finetuned_model_bak/checkpoint-350/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..12f6701240ca3d847d4cf795671fd29104319f72 --- /dev/null +++ b/finetuned_model_bak/checkpoint-350/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53c6b182e751b4d73acacca1f9b41337dfda1ed8128841f8994d467faaa07059 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-350/optimizer.pt b/finetuned_model_bak/checkpoint-350/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..df1b282328534433fb749914b67f76e7e0b3e3d7 --- /dev/null +++ b/finetuned_model_bak/checkpoint-350/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2943fbc0fa3f416bb40da689c1443b437a498fec1882e8dd956c00cf9343583 +size 15645387 diff --git a/finetuned_model_bak/checkpoint-350/rng_state.pth b/finetuned_model_bak/checkpoint-350/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..00b12bc8af081a57257a24aaf7aeee1bd6610aa3 --- /dev/null +++ b/finetuned_model_bak/checkpoint-350/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2b4b1c17596ace04140c038cb11489638072dab85128e4e17be236d8e58a15c +size 14645 diff --git a/finetuned_model_bak/checkpoint-350/scheduler.pt 
b/finetuned_model_bak/checkpoint-350/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2fc4cfc207f4bfff3920d2b6082ba1b5e2ac4267 --- /dev/null +++ b/finetuned_model_bak/checkpoint-350/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a8d97384ebc59726260013584cefb22c8faa91a7a2e96300a2a3f1601d980e +size 1465 diff --git a/finetuned_model_bak/checkpoint-350/trainer_state.json b/finetuned_model_bak/checkpoint-350/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dabac6492988f2730bbfc00963922ffd6157c2dd --- /dev/null +++ b/finetuned_model_bak/checkpoint-350/trainer_state.json @@ -0,0 +1,138 @@ +{ + "best_metric": 5.6431684494018555, + "best_model_checkpoint": "./finetuned_model\\checkpoint-350", + "epoch": 0.7335603877390621, + "eval_steps": 50, + "global_step": 350, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + 
{ + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.1637166491713536e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-350/training_args.bin b/finetuned_model_bak/checkpoint-350/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-350/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-400/adapter_model.bin b/finetuned_model_bak/checkpoint-400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..bfea5e9dd64bb9cf005857a2d73a65822a334347 --- /dev/null +++ b/finetuned_model_bak/checkpoint-400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b1de1481ccf65fd3da9e17d133276a1444b50ef01e8ffc0a5c74e8d4b0fb669 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-400/optimizer.pt b/finetuned_model_bak/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f55881fd9d06f396c1da2b1252ad7e76f2f8909 --- /dev/null +++ b/finetuned_model_bak/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:645a89a990b7a7decff99938060d4cb07f7f3110a9ed82acec2d58f980145527 +size 15645387 diff --git a/finetuned_model_bak/checkpoint-400/rng_state.pth b/finetuned_model_bak/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba5dc2793aab823b63fcf2edb143db088365a2f7 --- /dev/null +++ b/finetuned_model_bak/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c68f409b285e088bbc1bc953a68f9a0148111c65803ebb57c63005bd8382da7 +size 14645 diff --git a/finetuned_model_bak/checkpoint-400/scheduler.pt b/finetuned_model_bak/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3b0ee0521d5f9ab62781f12536994e4ea6f7e5c --- /dev/null +++ b/finetuned_model_bak/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ab7b0d3ed2603be4a7614706ed987654cfedb7ca5e1f458ab8420a31002fbd45 +size 1465 diff --git a/finetuned_model_bak/checkpoint-400/trainer_state.json b/finetuned_model_bak/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8c490f6fc614454563f0b8e47099d5a890c872ee --- /dev/null +++ b/finetuned_model_bak/checkpoint-400/trainer_state.json @@ -0,0 +1,153 @@ +{ + "best_metric": 5.634847640991211, + "best_model_checkpoint": "./finetuned_model\\checkpoint-400", + "epoch": 0.8383547288446423, + "eval_steps": 50, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + 
"eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.617135777493811e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-400/training_args.bin b/finetuned_model_bak/checkpoint-400/training_args.bin new file mode 100644 
index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-450/adapter_model.bin b/finetuned_model_bak/checkpoint-450/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d537e79cc22f5cb24cc6e362ab191c3ea568986f --- /dev/null +++ b/finetuned_model_bak/checkpoint-450/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:383436ddef16035244cb561c87e10117ee4f5cadef98b5bb5317a94e7f2128fb +size 7820719 diff --git a/finetuned_model_bak/checkpoint-450/optimizer.pt b/finetuned_model_bak/checkpoint-450/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d624c9ac4fa772b982ef5acf9832fbadec9c5734 --- /dev/null +++ b/finetuned_model_bak/checkpoint-450/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2126102b1fc5df56c49e6de5c8b0f4df73f90ee372a775af4e5f9cc76c7c0684 +size 15645387 diff --git a/finetuned_model_bak/checkpoint-450/rng_state.pth b/finetuned_model_bak/checkpoint-450/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5036e9674c961f76f322a3b0f70e3d7368186a58 --- /dev/null +++ b/finetuned_model_bak/checkpoint-450/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb90ab1bbb934f1062445a273b46828e3e583217618250fc926da2f93c82919c +size 14645 diff --git a/finetuned_model_bak/checkpoint-450/scheduler.pt b/finetuned_model_bak/checkpoint-450/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef1dd642fdb0b792bd81d16c841fa06fd0ac5715 --- /dev/null +++ b/finetuned_model_bak/checkpoint-450/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:90308133d6265c3000c624696e1251ef527c853b82e5e04b3ec2c49a605f339b +size 1465 diff --git a/finetuned_model_bak/checkpoint-450/trainer_state.json b/finetuned_model_bak/checkpoint-450/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..14aeeb04d4558f023553fcc400cf4a953b888e89 --- /dev/null +++ b/finetuned_model_bak/checkpoint-450/trainer_state.json @@ -0,0 +1,168 @@ +{ + "best_metric": 5.629579544067383, + "best_model_checkpoint": "./finetuned_model\\checkpoint-450", + "epoch": 0.9431490699502227, + "eval_steps": 50, + "global_step": 450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + 
"eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + 
"should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.067469640993997e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-450/training_args.bin b/finetuned_model_bak/checkpoint-450/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-450/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-50/adapter_model.bin b/finetuned_model_bak/checkpoint-50/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..809324d2eaf31977d05b78358b8a77cb25299df6 --- /dev/null +++ b/finetuned_model_bak/checkpoint-50/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff50c351a15429e0a5ce337b6697395111949053b176e3de69481d914bb0192f +size 7820719 diff --git a/finetuned_model_bak/checkpoint-50/optimizer.pt b/finetuned_model_bak/checkpoint-50/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1c5154c0e0425197a4bd0654a775323b91edf28 --- /dev/null +++ b/finetuned_model_bak/checkpoint-50/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f17e21a7fe7c126e3c673ce26565b978254a90ca3e256a6cfc3cc9d3b89dd58b +size 15645387 diff --git a/finetuned_model_bak/checkpoint-50/rng_state.pth b/finetuned_model_bak/checkpoint-50/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b3c43eb31ff58dae11fcfbeca4e206aa739f3bc2 --- /dev/null +++ b/finetuned_model_bak/checkpoint-50/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6c5b560b0ab14db34f064351288aee7a1f3052fc91cefea2a35cb93c4a2d489 +size 14645 diff --git 
a/finetuned_model_bak/checkpoint-50/scheduler.pt b/finetuned_model_bak/checkpoint-50/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8ec9f0e72bd04eaadb538cb57a23929d86cec8d --- /dev/null +++ b/finetuned_model_bak/checkpoint-50/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6293109ca1375b4276f6f2f7534ccba5ed0213e3b82f2414f5055a295c64d0ae +size 1465 diff --git a/finetuned_model_bak/checkpoint-50/trainer_state.json b/finetuned_model_bak/checkpoint-50/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2abacbab568aeeeac993f8047d2f3b61ca27edf8 --- /dev/null +++ b/finetuned_model_bak/checkpoint-50/trainer_state.json @@ -0,0 +1,48 @@ +{ + "best_metric": 6.328697204589844, + "best_model_checkpoint": "./finetuned_model\\checkpoint-50", + "epoch": 0.1047943411055803, + "eval_steps": 50, + "global_step": 50, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4558155898355712.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-50/training_args.bin b/finetuned_model_bak/checkpoint-50/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-50/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-500/adapter_model.bin b/finetuned_model_bak/checkpoint-500/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..de6328c5fe6c724d33a298dc5ac7c65a4e9febaa --- /dev/null +++ b/finetuned_model_bak/checkpoint-500/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b4f560b2cb246735b275395debf818d803241fac578dc3b43814c6c8d014766 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-500/optimizer.pt b/finetuned_model_bak/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..88869d063071b2d5a26cc54fda932c3bae838786 --- /dev/null +++ b/finetuned_model_bak/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c321ce6f9686fb39204842a4c4f870b7ce2ab95b9983a457389082d2ec5230de +size 15645387 diff --git a/finetuned_model_bak/checkpoint-500/rng_state.pth b/finetuned_model_bak/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..47bf6b5a6a77dbd1a632c9a718fd2535a1d11c0e --- /dev/null +++ b/finetuned_model_bak/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c5180bbdd3b1230fe7415afeaceaddb9485dea986f4021e673be809d1014508 +size 14645 diff --git a/finetuned_model_bak/checkpoint-500/scheduler.pt b/finetuned_model_bak/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e9c4fc5169a28d046b056a66139480e70667ce0b --- /dev/null +++ b/finetuned_model_bak/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8f43cf0013fde91b629a603284b6a52094f6610c14e66b27ee2b155a4f664139 +size 1465 diff --git a/finetuned_model_bak/checkpoint-500/trainer_state.json b/finetuned_model_bak/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3f91c932ace727937f633e9027edadf9087f3fc2 --- /dev/null +++ b/finetuned_model_bak/checkpoint-500/trainer_state.json @@ -0,0 +1,183 @@ +{ + "best_metric": 5.625565052032471, + "best_model_checkpoint": "./finetuned_model\\checkpoint-500", + "epoch": 1.047943411055803, + "eval_steps": 50, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + 
"eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + 
"eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.520644818144461e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-500/training_args.bin b/finetuned_model_bak/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-550/adapter_model.bin b/finetuned_model_bak/checkpoint-550/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8e37f7adc4911598de7eb1cd221c388c5974307f --- /dev/null +++ b/finetuned_model_bak/checkpoint-550/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64de9dfb75723f2401b43d70bb54044aa0e87fd4793d76055150b1e02cded597 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-550/optimizer.pt b/finetuned_model_bak/checkpoint-550/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..66b9d7fb4ac456d3475b66749b964d7b2f3486a7 --- /dev/null +++ b/finetuned_model_bak/checkpoint-550/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:227d7675af6112d002a6cc92b3133945e021355cdf1523c805e862c0022e92cc +size 15645387 diff --git a/finetuned_model_bak/checkpoint-550/rng_state.pth 
b/finetuned_model_bak/checkpoint-550/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3f9af67942d8cfbc10271f6eff34c338aa9dad3a --- /dev/null +++ b/finetuned_model_bak/checkpoint-550/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a022a285be8b793ef5f7b2615643771794f40574295b9361f32f66d2a2b9a04c +size 14645 diff --git a/finetuned_model_bak/checkpoint-550/scheduler.pt b/finetuned_model_bak/checkpoint-550/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1b41801622d691e753a8a463477e73eb160bba7d --- /dev/null +++ b/finetuned_model_bak/checkpoint-550/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6ff371162a32ca989aa703973c254f10e3e19a8335d5bd18d13c98a3cf55c50 +size 1465 diff --git a/finetuned_model_bak/checkpoint-550/trainer_state.json b/finetuned_model_bak/checkpoint-550/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4dcc7d40f45513266cf98820461463cab5bcc69c --- /dev/null +++ b/finetuned_model_bak/checkpoint-550/trainer_state.json @@ -0,0 +1,198 @@ +{ + "best_metric": 5.622195243835449, + "best_model_checkpoint": "./finetuned_model\\checkpoint-550", + "epoch": 1.1527377521613833, + "eval_steps": 50, + "global_step": 550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + 
"eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 
5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.973719544812339e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-550/training_args.bin b/finetuned_model_bak/checkpoint-550/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-550/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-600/adapter_model.bin b/finetuned_model_bak/checkpoint-600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..04a472363da86798c39578d964be557eb13ad852 --- /dev/null +++ b/finetuned_model_bak/checkpoint-600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8bbe68e9df8e7a33b712aacfb4f78b0d68085aba669049532fbc01ed819de9d +size 7820719 diff --git a/finetuned_model_bak/checkpoint-600/optimizer.pt b/finetuned_model_bak/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c1676535c0036acb01f8fc5668d323f21b91f3d --- /dev/null +++ b/finetuned_model_bak/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf1a42336cf90f6eed182e4ea372d1b488fc60955352679d48820a2a007e232c +size 15645387 diff --git a/finetuned_model_bak/checkpoint-600/rng_state.pth b/finetuned_model_bak/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f1a04c5322a80e70b4bacdda1a207e9e31190c21 --- /dev/null +++ b/finetuned_model_bak/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe1c108dc095659ef153c39d8cc3ec5942ab9406ec1eed43e3c2d0d263190bd4 +size 14645 diff --git a/finetuned_model_bak/checkpoint-600/scheduler.pt b/finetuned_model_bak/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4cba1579ab0280f364a5fb194e5fa43693c9b60 --- /dev/null +++ b/finetuned_model_bak/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6e60ce51483dacbad0a002462c5f9151839c6102bb1deb6436e65c3b1c6e8f7 +size 1465 diff --git a/finetuned_model_bak/checkpoint-600/trainer_state.json b/finetuned_model_bak/checkpoint-600/trainer_state.json new file mode 
100644 index 0000000000000000000000000000000000000000..ac8b82aa7520beb8966ec796677f7ef132fa6406 --- /dev/null +++ b/finetuned_model_bak/checkpoint-600/trainer_state.json @@ -0,0 +1,213 @@ +{ + "best_metric": 5.62029504776001, + "best_model_checkpoint": "./finetuned_model\\checkpoint-600", + "epoch": 1.2575320932669636, + "eval_steps": 50, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, 
+ "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 
1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.427985327202304e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-600/training_args.bin b/finetuned_model_bak/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-650/adapter_model.bin b/finetuned_model_bak/checkpoint-650/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6d360775478d7163be7b8af2c0828ac9a110396 --- /dev/null +++ b/finetuned_model_bak/checkpoint-650/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6fa8ccb0359f2d8a9ce3c9632ad9df526aa356a80df660ad5f23abb32b4a395 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-650/optimizer.pt b/finetuned_model_bak/checkpoint-650/optimizer.pt new file mode 
100644 index 0000000000000000000000000000000000000000..e736d304b7fb30fb159e24ef23c592a4d381228d --- /dev/null +++ b/finetuned_model_bak/checkpoint-650/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0019d1816aac880bc03d23679f5f813575d26db157dd7d672ffbd488387e9b1f +size 15645387 diff --git a/finetuned_model_bak/checkpoint-650/rng_state.pth b/finetuned_model_bak/checkpoint-650/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..596ed4d64a7f3b1133f62af9e694b002b9da9f86 --- /dev/null +++ b/finetuned_model_bak/checkpoint-650/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23aa4955451e0a4f2f3d43c5cd90101741d79a1c2ee399d4419c26e0ab4468c8 +size 14645 diff --git a/finetuned_model_bak/checkpoint-650/scheduler.pt b/finetuned_model_bak/checkpoint-650/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5851a7f564437b7754353c70c6d89782149ee12f --- /dev/null +++ b/finetuned_model_bak/checkpoint-650/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e262a2d3d2610d5f28bebf3b5b216a3ac26d8586a4e5f5b79231fd40dc7d09 +size 1465 diff --git a/finetuned_model_bak/checkpoint-650/trainer_state.json b/finetuned_model_bak/checkpoint-650/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2ae151948b8d50cd180521102c31b14526538005 --- /dev/null +++ b/finetuned_model_bak/checkpoint-650/trainer_state.json @@ -0,0 +1,228 @@ +{ + "best_metric": 5.617985725402832, + "best_model_checkpoint": "./finetuned_model\\checkpoint-650", + "epoch": 1.3623264343725439, + "eval_steps": 50, + "global_step": 650, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + 
"eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 
0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + }, + { + "epoch": 1.3623264343725439, + "grad_norm": 0.4124000668525696, + "learning_rate": 3.241525423728814e-05, + "loss": 5.6018, + "step": 650 + }, + { 
+ "epoch": 1.3623264343725439, + "eval_loss": 5.617985725402832, + "eval_runtime": 89.7286, + "eval_samples_per_second": 42.551, + "eval_steps_per_second": 5.327, + "step": 650 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.87940979594199e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-650/training_args.bin b/finetuned_model_bak/checkpoint-650/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-650/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-700/adapter_model.bin b/finetuned_model_bak/checkpoint-700/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d822b32e68a6c29f2a8ce6c0c4bfbaa7595dd9ad --- /dev/null +++ b/finetuned_model_bak/checkpoint-700/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf9835124b8b8d6ef26df0b08c30d8d02462d693fe4639097693299f49c3fb20 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-700/optimizer.pt b/finetuned_model_bak/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ceacaacf3d1dbe98f31c7fe5f55a925061647aac --- /dev/null +++ b/finetuned_model_bak/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36be4ae340d840b6da9d65dc51af5c26e380f7e98c1bdda1f7797887833cb785 +size 15645387 diff 
--git a/finetuned_model_bak/checkpoint-700/rng_state.pth b/finetuned_model_bak/checkpoint-700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e70c38d6495e65aa90a74705534d6b7343132798 --- /dev/null +++ b/finetuned_model_bak/checkpoint-700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a843270e4b33fb969af90c3d570bcf634aa3bdd4ff0dfc7e10c5961f557ab519 +size 14645 diff --git a/finetuned_model_bak/checkpoint-700/scheduler.pt b/finetuned_model_bak/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..21ee8f68fe14d00ab35212fbacf45f1c2810dad2 --- /dev/null +++ b/finetuned_model_bak/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db8b650e3cf05b5a0c019cc79fb3152db519584ba1bb3f8f5a2aaef450bbfbab +size 1465 diff --git a/finetuned_model_bak/checkpoint-700/trainer_state.json b/finetuned_model_bak/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7adcf2dc22d1ece0ab3dbdebf0df5584ef8dafe1 --- /dev/null +++ b/finetuned_model_bak/checkpoint-700/trainer_state.json @@ -0,0 +1,243 @@ +{ + "best_metric": 5.6168742179870605, + "best_model_checkpoint": "./finetuned_model\\checkpoint-700", + "epoch": 1.4671207754781241, + "eval_steps": 50, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 
0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { 
+ "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + }, + { + "epoch": 1.3623264343725439, + "grad_norm": 0.4124000668525696, + "learning_rate": 3.241525423728814e-05, + "loss": 5.6018, + "step": 650 + }, + { + "epoch": 1.3623264343725439, + "eval_loss": 5.617985725402832, + "eval_runtime": 89.7286, + "eval_samples_per_second": 42.551, + "eval_steps_per_second": 5.327, + "step": 650 + }, + { + "epoch": 1.4671207754781241, + "grad_norm": 0.5790793895721436, + "learning_rate": 2.711864406779661e-05, + "loss": 5.5992, + "step": 700 
+ }, + { + "epoch": 1.4671207754781241, + "eval_loss": 5.6168742179870605, + "eval_runtime": 89.8652, + "eval_samples_per_second": 42.486, + "eval_steps_per_second": 5.319, + "step": 700 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.330619013647565e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-700/training_args.bin b/finetuned_model_bak/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-750/adapter_model.bin b/finetuned_model_bak/checkpoint-750/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a5864b48cdf706dda700caae599ae2660124dd4 --- /dev/null +++ b/finetuned_model_bak/checkpoint-750/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d23057bbf80dc64d62fb1c87a49333cbabb37867eb23649e1482a3474e32ef3 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-750/optimizer.pt b/finetuned_model_bak/checkpoint-750/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..176306445985c472fef75ab146775bde6ec5e582 --- /dev/null +++ b/finetuned_model_bak/checkpoint-750/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:578cefc3d7c4f6e8cff3233002a57c588792c8cfe195f0e2399eded1f5abbcc1 +size 
15645387 diff --git a/finetuned_model_bak/checkpoint-750/rng_state.pth b/finetuned_model_bak/checkpoint-750/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ca8ba00387067856c9ecdb00de450999f8c5283 --- /dev/null +++ b/finetuned_model_bak/checkpoint-750/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a5dfae5db88b80d02380724963ed7eff0edf22aa2895ab911d305c687c4f8a3 +size 14645 diff --git a/finetuned_model_bak/checkpoint-750/scheduler.pt b/finetuned_model_bak/checkpoint-750/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..09f528513f5aff1d82841e8c8361942503f817f7 --- /dev/null +++ b/finetuned_model_bak/checkpoint-750/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac3c4604ac96222a3d7a67b93b67847f4ae6ac7aad8e15c1ddef6d323d5e6749 +size 1465 diff --git a/finetuned_model_bak/checkpoint-750/trainer_state.json b/finetuned_model_bak/checkpoint-750/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33d2e1c55ad75b44b346fd9fdda2436b8cf1dc26 --- /dev/null +++ b/finetuned_model_bak/checkpoint-750/trainer_state.json @@ -0,0 +1,258 @@ +{ + "best_metric": 5.614803791046143, + "best_model_checkpoint": "./finetuned_model\\checkpoint-750", + "epoch": 1.5719151165837046, + "eval_steps": 50, + "global_step": 750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + 
"epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 
+ }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + }, + { + "epoch": 1.3623264343725439, + "grad_norm": 0.4124000668525696, + "learning_rate": 3.241525423728814e-05, + "loss": 5.6018, + "step": 650 + }, + { + "epoch": 1.3623264343725439, + "eval_loss": 5.617985725402832, + "eval_runtime": 89.7286, + "eval_samples_per_second": 42.551, + "eval_steps_per_second": 5.327, + "step": 650 + }, + { + "epoch": 1.4671207754781241, + "grad_norm": 0.5790793895721436, + "learning_rate": 2.711864406779661e-05, + "loss": 5.5992, + 
"step": 700 + }, + { + "epoch": 1.4671207754781241, + "eval_loss": 5.6168742179870605, + "eval_runtime": 89.8652, + "eval_samples_per_second": 42.486, + "eval_steps_per_second": 5.319, + "step": 700 + }, + { + "epoch": 1.5719151165837046, + "grad_norm": 0.2715691924095154, + "learning_rate": 2.1822033898305087e-05, + "loss": 5.5957, + "step": 750 + }, + { + "epoch": 1.5719151165837046, + "eval_loss": 5.614803791046143, + "eval_runtime": 89.8257, + "eval_samples_per_second": 42.505, + "eval_steps_per_second": 5.321, + "step": 750 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.776891807637504e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-750/training_args.bin b/finetuned_model_bak/checkpoint-750/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-750/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-800/adapter_model.bin b/finetuned_model_bak/checkpoint-800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2a8b32d1f3464f332b0a577b23da80d4d1ca9a57 --- /dev/null +++ b/finetuned_model_bak/checkpoint-800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03357c6aa1ef607420db47909c0630310a96067b80d451e1afc541fe960b3b0b +size 7820719 diff --git a/finetuned_model_bak/checkpoint-800/optimizer.pt 
b/finetuned_model_bak/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..46bea3a56fa6462c567fa3a3cd087e76263325de --- /dev/null +++ b/finetuned_model_bak/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e431baf722f0caabedb0f49e63d38cae34622e897d71e12761ba6cf2b71edf4e +size 15645387 diff --git a/finetuned_model_bak/checkpoint-800/rng_state.pth b/finetuned_model_bak/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..09c1cfce940cde5179ab52ccc9d24d7fbb8d8d8e --- /dev/null +++ b/finetuned_model_bak/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5af9b348c7a29a5a2e0440ed7da4a5c13b8c9be566400c034836654fcb450554 +size 14645 diff --git a/finetuned_model_bak/checkpoint-800/scheduler.pt b/finetuned_model_bak/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..80d3cf68e890f45a3da4765f80b23a4aa733198a --- /dev/null +++ b/finetuned_model_bak/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b26de0bf4c315fb494132dee7686bd1c1786953d132fdf43ff510f155188a38 +size 1465 diff --git a/finetuned_model_bak/checkpoint-800/trainer_state.json b/finetuned_model_bak/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6b5bb819c7865fa58478f5bf81481308059b04 --- /dev/null +++ b/finetuned_model_bak/checkpoint-800/trainer_state.json @@ -0,0 +1,273 @@ +{ + "best_metric": 5.614142417907715, + "best_model_checkpoint": "./finetuned_model\\checkpoint-800", + "epoch": 1.676709457689285, + "eval_steps": 50, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 
12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + 
"loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + }, + { + "epoch": 1.3623264343725439, + "grad_norm": 0.4124000668525696, + "learning_rate": 
3.241525423728814e-05, + "loss": 5.6018, + "step": 650 + }, + { + "epoch": 1.3623264343725439, + "eval_loss": 5.617985725402832, + "eval_runtime": 89.7286, + "eval_samples_per_second": 42.551, + "eval_steps_per_second": 5.327, + "step": 650 + }, + { + "epoch": 1.4671207754781241, + "grad_norm": 0.5790793895721436, + "learning_rate": 2.711864406779661e-05, + "loss": 5.5992, + "step": 700 + }, + { + "epoch": 1.4671207754781241, + "eval_loss": 5.6168742179870605, + "eval_runtime": 89.8652, + "eval_samples_per_second": 42.486, + "eval_steps_per_second": 5.319, + "step": 700 + }, + { + "epoch": 1.5719151165837046, + "grad_norm": 0.2715691924095154, + "learning_rate": 2.1822033898305087e-05, + "loss": 5.5957, + "step": 750 + }, + { + "epoch": 1.5719151165837046, + "eval_loss": 5.614803791046143, + "eval_runtime": 89.8257, + "eval_samples_per_second": 42.505, + "eval_steps_per_second": 5.321, + "step": 750 + }, + { + "epoch": 1.676709457689285, + "grad_norm": 0.3297811448574066, + "learning_rate": 1.652542372881356e-05, + "loss": 5.5973, + "step": 800 + }, + { + "epoch": 1.676709457689285, + "eval_loss": 5.614142417907715, + "eval_runtime": 89.7141, + "eval_samples_per_second": 42.557, + "eval_steps_per_second": 5.328, + "step": 800 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.22848847720448e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-800/training_args.bin b/finetuned_model_bak/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ 
b/finetuned_model_bak/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-850/adapter_model.bin b/finetuned_model_bak/checkpoint-850/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..50838c4596ca173558bba7f03ab0a9a22b2af87f --- /dev/null +++ b/finetuned_model_bak/checkpoint-850/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0393b24b43cc80877c1fa976de2062a9fb28b70c0d4ec579e8ae24ebc4a2368 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-850/optimizer.pt b/finetuned_model_bak/checkpoint-850/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bf480ebff2c511df869b5d8fb837dbe47d5ba2b9 --- /dev/null +++ b/finetuned_model_bak/checkpoint-850/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:258682781863edeb250fd0efacf14e25b25246bce22374f73b6dced613be789f +size 15645387 diff --git a/finetuned_model_bak/checkpoint-850/rng_state.pth b/finetuned_model_bak/checkpoint-850/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e659b0c9d9f31995a2265881898dd24b12de2b20 --- /dev/null +++ b/finetuned_model_bak/checkpoint-850/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f3e990852c2971a9be3791c97ac7a0471b07c09a7ad792a6a61510a887204f2 +size 14645 diff --git a/finetuned_model_bak/checkpoint-850/scheduler.pt b/finetuned_model_bak/checkpoint-850/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe13c263a5221c1253054df3a0a8d6c340c1f672 --- /dev/null +++ b/finetuned_model_bak/checkpoint-850/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df9c6677c9d467e7c25b43f6812528e44b46ec6b80d1c12604df13acef577937 +size 1465 diff --git 
a/finetuned_model_bak/checkpoint-850/trainer_state.json b/finetuned_model_bak/checkpoint-850/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3ec1dcce2da124f794b7cb499931b6bc89fc9429 --- /dev/null +++ b/finetuned_model_bak/checkpoint-850/trainer_state.json @@ -0,0 +1,288 @@ +{ + "best_metric": 5.613337516784668, + "best_model_checkpoint": "./finetuned_model\\checkpoint-850", + "epoch": 1.7815037987948652, + "eval_steps": 50, + "global_step": 850, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 
1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + 
"grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + }, + { + "epoch": 1.3623264343725439, + "grad_norm": 0.4124000668525696, + "learning_rate": 3.241525423728814e-05, + "loss": 5.6018, + "step": 650 + }, + { + "epoch": 1.3623264343725439, + "eval_loss": 5.617985725402832, + "eval_runtime": 89.7286, + "eval_samples_per_second": 42.551, + "eval_steps_per_second": 5.327, + "step": 650 + }, + { + "epoch": 1.4671207754781241, + "grad_norm": 0.5790793895721436, + "learning_rate": 2.711864406779661e-05, + "loss": 5.5992, + "step": 700 + }, + { + "epoch": 1.4671207754781241, + "eval_loss": 5.6168742179870605, + "eval_runtime": 89.8652, + "eval_samples_per_second": 42.486, + "eval_steps_per_second": 5.319, + "step": 700 + }, + { + "epoch": 1.5719151165837046, + "grad_norm": 0.2715691924095154, + "learning_rate": 2.1822033898305087e-05, + "loss": 5.5957, + "step": 750 + }, + { + "epoch": 1.5719151165837046, + "eval_loss": 5.614803791046143, + "eval_runtime": 89.8257, + "eval_samples_per_second": 42.505, + "eval_steps_per_second": 5.321, + "step": 750 + }, + { + "epoch": 1.676709457689285, + "grad_norm": 0.3297811448574066, + "learning_rate": 1.652542372881356e-05, + "loss": 5.5973, + "step": 800 + }, + { + "epoch": 1.676709457689285, + "eval_loss": 5.614142417907715, + "eval_runtime": 89.7141, + "eval_samples_per_second": 42.557, + "eval_steps_per_second": 5.328, + "step": 800 + }, + { + "epoch": 
1.7815037987948652, + "grad_norm": 0.46387508511543274, + "learning_rate": 1.1228813559322036e-05, + "loss": 5.5946, + "step": 850 + }, + { + "epoch": 1.7815037987948652, + "eval_loss": 5.613337516784668, + "eval_runtime": 89.7317, + "eval_samples_per_second": 42.549, + "eval_steps_per_second": 5.327, + "step": 850 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.675378324158874e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-850/training_args.bin b/finetuned_model_bak/checkpoint-850/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-850/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-900/adapter_model.bin b/finetuned_model_bak/checkpoint-900/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..5c2f8afc03861c1860ad5fad58a00d77d3946ba4 --- /dev/null +++ b/finetuned_model_bak/checkpoint-900/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60f5d6cd747956e03faacad9e3fc936760615a58daa2a6059b8e56746d94e57d +size 7820719 diff --git a/finetuned_model_bak/checkpoint-900/optimizer.pt b/finetuned_model_bak/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ba06b46780de2f417ab42776f79fcbd0f07af75 --- /dev/null +++ b/finetuned_model_bak/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:2d324d8bc2f8a2ebb28450cd6d5076cccc1ff768a4b440dda4038ae07887d83a +size 15645387 diff --git a/finetuned_model_bak/checkpoint-900/rng_state.pth b/finetuned_model_bak/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..72ef87c5a85fbefde29b70ed237437988a678e7b --- /dev/null +++ b/finetuned_model_bak/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a38ae24d888667b95c75c7e8babd310033b94c85642812be08de488f9f0a0eaa +size 14645 diff --git a/finetuned_model_bak/checkpoint-900/scheduler.pt b/finetuned_model_bak/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..58a14b0b718012ec60801c78303ad5c7164ef3c9 --- /dev/null +++ b/finetuned_model_bak/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f33feac762cb84302eabcc700ba7a4e0b0c67e8413e9341b34767b5426d8a1 +size 1465 diff --git a/finetuned_model_bak/checkpoint-900/trainer_state.json b/finetuned_model_bak/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b1836607edf2cb8bea5d7be6bb80c5cf1332bd70 --- /dev/null +++ b/finetuned_model_bak/checkpoint-900/trainer_state.json @@ -0,0 +1,303 @@ +{ + "best_metric": 5.612890243530273, + "best_model_checkpoint": "./finetuned_model\\checkpoint-900", + "epoch": 1.8862981399004455, + "eval_steps": 50, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 
0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + 
"epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + }, + { + "epoch": 1.3623264343725439, + "grad_norm": 0.4124000668525696, + "learning_rate": 3.241525423728814e-05, + "loss": 5.6018, + "step": 650 + }, + { + "epoch": 1.3623264343725439, + "eval_loss": 5.617985725402832, + "eval_runtime": 89.7286, + "eval_samples_per_second": 42.551, + "eval_steps_per_second": 5.327, + "step": 650 + 
}, + { + "epoch": 1.4671207754781241, + "grad_norm": 0.5790793895721436, + "learning_rate": 2.711864406779661e-05, + "loss": 5.5992, + "step": 700 + }, + { + "epoch": 1.4671207754781241, + "eval_loss": 5.6168742179870605, + "eval_runtime": 89.8652, + "eval_samples_per_second": 42.486, + "eval_steps_per_second": 5.319, + "step": 700 + }, + { + "epoch": 1.5719151165837046, + "grad_norm": 0.2715691924095154, + "learning_rate": 2.1822033898305087e-05, + "loss": 5.5957, + "step": 750 + }, + { + "epoch": 1.5719151165837046, + "eval_loss": 5.614803791046143, + "eval_runtime": 89.8257, + "eval_samples_per_second": 42.505, + "eval_steps_per_second": 5.321, + "step": 750 + }, + { + "epoch": 1.676709457689285, + "grad_norm": 0.3297811448574066, + "learning_rate": 1.652542372881356e-05, + "loss": 5.5973, + "step": 800 + }, + { + "epoch": 1.676709457689285, + "eval_loss": 5.614142417907715, + "eval_runtime": 89.7141, + "eval_samples_per_second": 42.557, + "eval_steps_per_second": 5.328, + "step": 800 + }, + { + "epoch": 1.7815037987948652, + "grad_norm": 0.46387508511543274, + "learning_rate": 1.1228813559322036e-05, + "loss": 5.5946, + "step": 850 + }, + { + "epoch": 1.7815037987948652, + "eval_loss": 5.613337516784668, + "eval_runtime": 89.7317, + "eval_samples_per_second": 42.549, + "eval_steps_per_second": 5.327, + "step": 850 + }, + { + "epoch": 1.8862981399004455, + "grad_norm": 0.1856226772069931, + "learning_rate": 5.932203389830509e-06, + "loss": 5.5969, + "step": 900 + }, + { + "epoch": 1.8862981399004455, + "eval_loss": 5.612890243530273, + "eval_runtime": 89.6957, + "eval_samples_per_second": 42.566, + "eval_steps_per_second": 5.329, + "step": 900 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + 
}, + "attributes": {} + } + }, + "total_flos": 8.129242304618496e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-900/training_args.bin b/finetuned_model_bak/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-950/adapter_model.bin b/finetuned_model_bak/checkpoint-950/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b5def9c28c1018ecb89adecda8e4bdf2059503fa --- /dev/null +++ b/finetuned_model_bak/checkpoint-950/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aad7ddf15131c1a32e005a6f416cd40f90f8723d87643492667bf03506cfc259 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-950/optimizer.pt b/finetuned_model_bak/checkpoint-950/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c40b06f82cf7fb18bb10f847cc911578bc0dca53 --- /dev/null +++ b/finetuned_model_bak/checkpoint-950/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4844ab63bf3d9d09b6d21f330b413b632d8195632658ee1fc22320ea3c64942b +size 15645387 diff --git a/finetuned_model_bak/checkpoint-950/rng_state.pth b/finetuned_model_bak/checkpoint-950/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6ef988d8502071789954ea86a096d4cfcd2f3beb --- /dev/null +++ b/finetuned_model_bak/checkpoint-950/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96f5b29a9591f01b1b7b31f7f1beceb2cb93bd9477771011c6bba304c87dddf4 +size 14645 diff --git a/finetuned_model_bak/checkpoint-950/scheduler.pt 
b/finetuned_model_bak/checkpoint-950/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c2e218af9b627fd88acbd75771c9053f555b239 --- /dev/null +++ b/finetuned_model_bak/checkpoint-950/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acd92a743467ee48cead64fcf4b8520b5b06f4e2c05a7e2c4468fcea9564b2cf +size 1465 diff --git a/finetuned_model_bak/checkpoint-950/trainer_state.json b/finetuned_model_bak/checkpoint-950/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..45ee7b7154e3468d5ea1a1f4c8f5ef2daa529a79 --- /dev/null +++ b/finetuned_model_bak/checkpoint-950/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 5.612239837646484, + "best_model_checkpoint": "./finetuned_model\\checkpoint-950", + "epoch": 1.9910924810060258, + "eval_steps": 50, + "global_step": 950, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + 
{ + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + "eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + 
"step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + "eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + }, + { + "epoch": 1.3623264343725439, + "grad_norm": 0.4124000668525696, + "learning_rate": 3.241525423728814e-05, + "loss": 5.6018, + "step": 650 + }, + { + "epoch": 1.3623264343725439, + "eval_loss": 5.617985725402832, + "eval_runtime": 89.7286, + "eval_samples_per_second": 42.551, + "eval_steps_per_second": 5.327, + "step": 650 + }, + { + "epoch": 1.4671207754781241, + "grad_norm": 0.5790793895721436, + "learning_rate": 2.711864406779661e-05, + "loss": 5.5992, + "step": 700 + }, + { + "epoch": 1.4671207754781241, + "eval_loss": 5.6168742179870605, + "eval_runtime": 89.8652, + "eval_samples_per_second": 42.486, + "eval_steps_per_second": 5.319, + "step": 700 + }, + { + "epoch": 1.5719151165837046, + "grad_norm": 0.2715691924095154, + "learning_rate": 2.1822033898305087e-05, + "loss": 5.5957, + "step": 750 + }, + { + "epoch": 1.5719151165837046, + "eval_loss": 5.614803791046143, + "eval_runtime": 89.8257, + "eval_samples_per_second": 42.505, + "eval_steps_per_second": 
5.321, + "step": 750 + }, + { + "epoch": 1.676709457689285, + "grad_norm": 0.3297811448574066, + "learning_rate": 1.652542372881356e-05, + "loss": 5.5973, + "step": 800 + }, + { + "epoch": 1.676709457689285, + "eval_loss": 5.614142417907715, + "eval_runtime": 89.7141, + "eval_samples_per_second": 42.557, + "eval_steps_per_second": 5.328, + "step": 800 + }, + { + "epoch": 1.7815037987948652, + "grad_norm": 0.46387508511543274, + "learning_rate": 1.1228813559322036e-05, + "loss": 5.5946, + "step": 850 + }, + { + "epoch": 1.7815037987948652, + "eval_loss": 5.613337516784668, + "eval_runtime": 89.7317, + "eval_samples_per_second": 42.549, + "eval_steps_per_second": 5.327, + "step": 850 + }, + { + "epoch": 1.8862981399004455, + "grad_norm": 0.1856226772069931, + "learning_rate": 5.932203389830509e-06, + "loss": 5.5969, + "step": 900 + }, + { + "epoch": 1.8862981399004455, + "eval_loss": 5.612890243530273, + "eval_runtime": 89.6957, + "eval_samples_per_second": 42.566, + "eval_steps_per_second": 5.329, + "step": 900 + }, + { + "epoch": 1.9910924810060258, + "grad_norm": 0.35210326313972473, + "learning_rate": 6.355932203389831e-07, + "loss": 5.5982, + "step": 950 + }, + { + "epoch": 1.9910924810060258, + "eval_loss": 5.612239837646484, + "eval_runtime": 89.5895, + "eval_samples_per_second": 42.617, + "eval_steps_per_second": 5.335, + "step": 950 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.585158344936653e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-950/training_args.bin b/finetuned_model_bak/checkpoint-950/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-950/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/finetuned_model_bak/checkpoint-954/adapter_model.bin b/finetuned_model_bak/checkpoint-954/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d8afc59a927c55738337ae644d81357c6c048be8 --- /dev/null +++ b/finetuned_model_bak/checkpoint-954/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d170a3bd1544cd4aae4bec59bfe1edc962b4e6469959ac8774fe2801e60daad1 +size 7820719 diff --git a/finetuned_model_bak/checkpoint-954/optimizer.pt b/finetuned_model_bak/checkpoint-954/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..784bc24a38dee7b94fd9313de0bcec60fa6d72ba --- /dev/null +++ b/finetuned_model_bak/checkpoint-954/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd08bc8db209db4090b425f519a19a4d6535d70199facd69a5a317927203ba97 +size 15645387 diff --git a/finetuned_model_bak/checkpoint-954/rng_state.pth b/finetuned_model_bak/checkpoint-954/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..51c9ed11dc0f463ce654bb4f40537b0893482b07 --- /dev/null +++ b/finetuned_model_bak/checkpoint-954/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbd58039a23ff37e64585bf93a920762e1866d710290dc135b0ff26e45a8892 +size 14645 diff --git a/finetuned_model_bak/checkpoint-954/scheduler.pt b/finetuned_model_bak/checkpoint-954/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ffa4cfca7fdb6300f7f6cf139f0e8f585845ea9 --- /dev/null +++ b/finetuned_model_bak/checkpoint-954/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:057d9203d7abc63f7df0c513d8d59ed0cd85ed64b73a59720d1ee1f66ca46fef +size 1465 diff --git a/finetuned_model_bak/checkpoint-954/trainer_state.json b/finetuned_model_bak/checkpoint-954/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ec22b47e63f4c8e415802f3e263523c57bcb7ff7 --- /dev/null +++ b/finetuned_model_bak/checkpoint-954/trainer_state.json @@ -0,0 +1,318 @@ +{ + "best_metric": 5.612239837646484, + "best_model_checkpoint": "./finetuned_model\\checkpoint-950", + "epoch": 1.9994760282944721, + "eval_steps": 50, + "global_step": 954, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1047943411055803, + "grad_norm": 2.537168025970459, + "learning_rate": 9.597457627118645e-05, + "loss": 12.7457, + "step": 50 + }, + { + "epoch": 0.1047943411055803, + "eval_loss": 6.328697204589844, + "eval_runtime": 90.5761, + "eval_samples_per_second": 42.152, + "eval_steps_per_second": 5.277, + "step": 50 + }, + { + "epoch": 0.2095886822111606, + "grad_norm": 0.616327166557312, + "learning_rate": 9.067796610169493e-05, + "loss": 6.1336, + "step": 100 + }, + { + "epoch": 0.2095886822111606, + "eval_loss": 6.069918632507324, + "eval_runtime": 90.0646, + "eval_samples_per_second": 42.392, + "eval_steps_per_second": 5.307, + "step": 100 + }, + { + "epoch": 0.3143830233167409, + "grad_norm": 0.34638333320617676, + "learning_rate": 8.538135593220339e-05, + "loss": 6.0714, + "step": 150 + }, + { + "epoch": 0.3143830233167409, + "eval_loss": 6.042043685913086, + "eval_runtime": 90.3662, + "eval_samples_per_second": 42.25, + "eval_steps_per_second": 5.29, + "step": 150 + }, + { + "epoch": 0.4191773644223212, + "grad_norm": 0.2187536656856537, + "learning_rate": 8.008474576271187e-05, + "loss": 6.0437, + "step": 200 + }, + { + "epoch": 0.4191773644223212, + "eval_loss": 6.017848014831543, + "eval_runtime": 89.6746, + "eval_samples_per_second": 42.576, + 
"eval_steps_per_second": 5.33, + "step": 200 + }, + { + "epoch": 0.5239717055279015, + "grad_norm": 1.946129560470581, + "learning_rate": 7.478813559322034e-05, + "loss": 5.9968, + "step": 250 + }, + { + "epoch": 0.5239717055279015, + "eval_loss": 5.883882522583008, + "eval_runtime": 89.8055, + "eval_samples_per_second": 42.514, + "eval_steps_per_second": 5.323, + "step": 250 + }, + { + "epoch": 0.6287660466334818, + "grad_norm": 1.0262460708618164, + "learning_rate": 6.949152542372882e-05, + "loss": 5.7859, + "step": 300 + }, + { + "epoch": 0.6287660466334818, + "eval_loss": 5.672642230987549, + "eval_runtime": 89.8111, + "eval_samples_per_second": 42.511, + "eval_steps_per_second": 5.322, + "step": 300 + }, + { + "epoch": 0.7335603877390621, + "grad_norm": 0.26455119252204895, + "learning_rate": 6.419491525423728e-05, + "loss": 5.6357, + "step": 350 + }, + { + "epoch": 0.7335603877390621, + "eval_loss": 5.6431684494018555, + "eval_runtime": 89.6683, + "eval_samples_per_second": 42.579, + "eval_steps_per_second": 5.331, + "step": 350 + }, + { + "epoch": 0.8383547288446423, + "grad_norm": 0.17298221588134766, + "learning_rate": 5.889830508474577e-05, + "loss": 5.6224, + "step": 400 + }, + { + "epoch": 0.8383547288446423, + "eval_loss": 5.634847640991211, + "eval_runtime": 89.8326, + "eval_samples_per_second": 42.501, + "eval_steps_per_second": 5.321, + "step": 400 + }, + { + "epoch": 0.9431490699502227, + "grad_norm": 0.41854000091552734, + "learning_rate": 5.3601694915254244e-05, + "loss": 5.6145, + "step": 450 + }, + { + "epoch": 0.9431490699502227, + "eval_loss": 5.629579544067383, + "eval_runtime": 89.8024, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 5.323, + "step": 450 + }, + { + "epoch": 1.047943411055803, + "grad_norm": 0.5269731879234314, + "learning_rate": 4.8305084745762714e-05, + "loss": 5.6116, + "step": 500 + }, + { + "epoch": 1.047943411055803, + "eval_loss": 5.625565052032471, + "eval_runtime": 89.7643, + 
"eval_samples_per_second": 42.534, + "eval_steps_per_second": 5.325, + "step": 500 + }, + { + "epoch": 1.1527377521613833, + "grad_norm": 0.4864833354949951, + "learning_rate": 4.300847457627119e-05, + "loss": 5.6074, + "step": 550 + }, + { + "epoch": 1.1527377521613833, + "eval_loss": 5.622195243835449, + "eval_runtime": 89.7852, + "eval_samples_per_second": 42.524, + "eval_steps_per_second": 5.324, + "step": 550 + }, + { + "epoch": 1.2575320932669636, + "grad_norm": 0.4364955425262451, + "learning_rate": 3.771186440677966e-05, + "loss": 5.605, + "step": 600 + }, + { + "epoch": 1.2575320932669636, + "eval_loss": 5.62029504776001, + "eval_runtime": 89.809, + "eval_samples_per_second": 42.512, + "eval_steps_per_second": 5.322, + "step": 600 + }, + { + "epoch": 1.3623264343725439, + "grad_norm": 0.4124000668525696, + "learning_rate": 3.241525423728814e-05, + "loss": 5.6018, + "step": 650 + }, + { + "epoch": 1.3623264343725439, + "eval_loss": 5.617985725402832, + "eval_runtime": 89.7286, + "eval_samples_per_second": 42.551, + "eval_steps_per_second": 5.327, + "step": 650 + }, + { + "epoch": 1.4671207754781241, + "grad_norm": 0.5790793895721436, + "learning_rate": 2.711864406779661e-05, + "loss": 5.5992, + "step": 700 + }, + { + "epoch": 1.4671207754781241, + "eval_loss": 5.6168742179870605, + "eval_runtime": 89.8652, + "eval_samples_per_second": 42.486, + "eval_steps_per_second": 5.319, + "step": 700 + }, + { + "epoch": 1.5719151165837046, + "grad_norm": 0.2715691924095154, + "learning_rate": 2.1822033898305087e-05, + "loss": 5.5957, + "step": 750 + }, + { + "epoch": 1.5719151165837046, + "eval_loss": 5.614803791046143, + "eval_runtime": 89.8257, + "eval_samples_per_second": 42.505, + "eval_steps_per_second": 5.321, + "step": 750 + }, + { + "epoch": 1.676709457689285, + "grad_norm": 0.3297811448574066, + "learning_rate": 1.652542372881356e-05, + "loss": 5.5973, + "step": 800 + }, + { + "epoch": 1.676709457689285, + "eval_loss": 5.614142417907715, + "eval_runtime": 
89.7141, + "eval_samples_per_second": 42.557, + "eval_steps_per_second": 5.328, + "step": 800 + }, + { + "epoch": 1.7815037987948652, + "grad_norm": 0.46387508511543274, + "learning_rate": 1.1228813559322036e-05, + "loss": 5.5946, + "step": 850 + }, + { + "epoch": 1.7815037987948652, + "eval_loss": 5.613337516784668, + "eval_runtime": 89.7317, + "eval_samples_per_second": 42.549, + "eval_steps_per_second": 5.327, + "step": 850 + }, + { + "epoch": 1.8862981399004455, + "grad_norm": 0.1856226772069931, + "learning_rate": 5.932203389830509e-06, + "loss": 5.5969, + "step": 900 + }, + { + "epoch": 1.8862981399004455, + "eval_loss": 5.612890243530273, + "eval_runtime": 89.6957, + "eval_samples_per_second": 42.566, + "eval_steps_per_second": 5.329, + "step": 900 + }, + { + "epoch": 1.9910924810060258, + "grad_norm": 0.35210326313972473, + "learning_rate": 6.355932203389831e-07, + "loss": 5.5982, + "step": 950 + }, + { + "epoch": 1.9910924810060258, + "eval_loss": 5.612239837646484, + "eval_runtime": 89.5895, + "eval_samples_per_second": 42.617, + "eval_steps_per_second": 5.335, + "step": 950 + } + ], + "logging_steps": 50, + "max_steps": 954, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.622296323355443e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/finetuned_model_bak/checkpoint-954/training_args.bin b/finetuned_model_bak/checkpoint-954/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..31fcc12ed05feb6f609c6ef8776778afa425ad50 --- /dev/null +++ b/finetuned_model_bak/checkpoint-954/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:6dd2a07fcaedafd2f497aa35369db0abcc3f13d7a5884100f7140262b1d5e121 +size 5585 diff --git a/fiqa.py b/fiqa.py new file mode 100644 index 0000000000000000000000000000000000000000..03dd806c004fced88f6054cbe1064a33bb8870b0 --- /dev/null +++ b/fiqa.py @@ -0,0 +1,85 @@ +import warnings +warnings.filterwarnings("ignore") + +from sklearn.metrics import accuracy_score,f1_score +from datasets import load_dataset +from tqdm import tqdm +import datasets +import torch + +def format_example(example: dict) -> dict: + context = f"Instruction: {example['instruction']}\n" + if example.get("input"): + context += f"Input: {example['input']}\n" + context += "Answer: " + target = example["output"] + return {"context": context, "target": target} + +def add_instructions(x): + if x.format == "post": + return "What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}." + else: + return "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." + +def make_label(x): + if x < - 0.1: return "negative" + elif x >=-0.1 and x < 0.1: return "neutral" + elif x >= 0.1: return "positive" + +def change_target(x): + if 'positive' in x or 'Positive' in x: + return 'positive' + elif 'negative' in x or 'Negative' in x: + return 'negative' + else: + return 'neutral' + +def test_fiqa(model, tokenizer, batch_size = 8, prompt_fun = None ): + dataset = load_dataset('pauri32/fiqa-2018') + dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ]) + dataset = dataset.train_test_split(0.226, seed = 42)['test'] + dataset = dataset.to_pandas() + dataset["output"] = dataset.sentiment_score.apply(make_label) + if prompt_fun is None: + dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." 
+ else: + dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) + + dataset = dataset[['sentence', 'output',"instruction"]] + dataset.columns = ["input", "output","instruction"] + dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") + + # print example + print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") + + context = dataset['context'].tolist() + total_steps = dataset.shape[0]//batch_size + 1 + print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") + + out_text_list = [] + + for i in tqdm(range(total_steps)): + tmp_context = context[i* batch_size:(i+1)* batch_size] + tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) + # tokens.pop('token_type_ids') + for k in tokens.keys(): + tokens[k] = tokens[k].cuda() + + res = model.generate(**tokens, max_length=512) + res_sentences = [tokenizer.decode(i) for i in res] + out_text = [o.split("Answer: ")[1] for o in res_sentences] + out_text_list += out_text + torch.cuda.empty_cache() + + dataset["out_text"] = out_text_list + dataset["new_target"] = dataset["target"].apply(change_target) + dataset["new_out"] = dataset["out_text"].apply(change_target) + + acc = accuracy_score(dataset["new_target"], dataset["new_out"]) + f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") + f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") + f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") + + print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. 
") + + return dataset \ No newline at end of file diff --git a/fpb.py b/fpb.py new file mode 100644 index 0000000000000000000000000000000000000000..39d4730656a281fc356b53a866a812bde9aa16f1 --- /dev/null +++ b/fpb.py @@ -0,0 +1,80 @@ +import warnings +warnings.filterwarnings("ignore") + +from sklearn.metrics import accuracy_score,f1_score +from datasets import load_dataset +from tqdm import tqdm +import datasets +import torch + +dic = { + 0:"negative", + 1:'neutral', + 2:'positive', + } + +def format_example(example: dict) -> dict: + context = f"Instruction: {example['instruction']}\n" + if example.get("input"): + context += f"Input: {example['input']}\n" + context += "Answer: " + target = example["output"] + return {"context": context, "target": target} + +def change_target(x): + if 'positive' in x or 'Positive' in x: + return 'positive' + elif 'negative' in x or 'Negative' in x: + return 'negative' + else: + return 'neutral' + +def test_fpb(model, tokenizer, batch_size = 8, prompt_fun = None ): + instructions = load_dataset("financial_phrasebank", "sentences_50agree") + instructions = instructions["train"] + instructions = instructions.train_test_split(seed = 42)['test'] + instructions = instructions.to_pandas() + instructions.columns = ["input", "output"] + instructions["output"] = instructions["output"].apply(lambda x:dic[x]) + + if prompt_fun is None: + instructions["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}." + else: + instructions["instruction"] = instructions.apply(prompt_fun, axis = 1) + + instructions[["context","target"]] = instructions.apply(format_example, axis = 1, result_type="expand") + + # print example + print(f"\n\nPrompt example:\n{instructions['context'][0]}\n\n") + + + context = instructions['context'].tolist() + + total_steps = instructions.shape[0]//batch_size + 1 + print(f"Total len: {len(context)}. Batchsize: {batch_size}. 
Total steps: {total_steps}") + + + out_text_list = [] + for i in tqdm(range(total_steps)): + tmp_context = context[i* batch_size:(i+1)* batch_size] + tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) + for k in tokens.keys(): + tokens[k] = tokens[k].cuda() + res = model.generate(**tokens, max_length=512) + res_sentences = [tokenizer.decode(i) for i in res] + out_text = [o.split("Answer: ")[1] for o in res_sentences] + out_text_list += out_text + torch.cuda.empty_cache() + + instructions["out_text"] = out_text_list + instructions["new_target"] = instructions["target"].apply(change_target) + instructions["new_out"] = instructions["out_text"].apply(change_target) + + acc = accuracy_score(instructions["new_target"], instructions["new_out"]) + f1_macro = f1_score(instructions["new_target"], instructions["new_out"], average = "macro") + f1_micro = f1_score(instructions["new_target"], instructions["new_out"], average = "micro") + f1_weighted = f1_score(instructions["new_target"], instructions["new_out"], average = "weighted") + + print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") + + return instructions \ No newline at end of file diff --git a/git b/git new file mode 100644 index 0000000000000000000000000000000000000000..e4d485ab38061be4384efe2164c674a70f363617 --- /dev/null +++ b/git @@ -0,0 +1,5 @@ +ps (cygwin) 2.5.0 +Show process statistics +Copyright (C) 1996 - 2016 Red Hat, Inc. +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
diff --git "a/logs/events.out.tfevents.1753875992.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.19780.7" "b/logs/events.out.tfevents.1753875992.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.19780.7" new file mode 100644 index 0000000000000000000000000000000000000000..aa1eee2b34858c9a519a20f69419d2fcd157b14c --- /dev/null +++ "b/logs/events.out.tfevents.1753875992.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.19780.7" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7c6cba9cafc083428acab1483336581c6201928345479091cde8311d1337e04 +size 15764 diff --git "a/logs/events.out.tfevents.1754049207.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.8068.1" "b/logs/events.out.tfevents.1754049207.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.8068.1" new file mode 100644 index 0000000000000000000000000000000000000000..b6240d7dc488b42d9e9e1af41b2bc380fd1e4e59 --- /dev/null +++ "b/logs/events.out.tfevents.1754049207.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.8068.1" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606a2c86c6cd768bd2a88c698994f29d334790e2c447d7fbfd2ac23e95e6b3b5 +size 6746 diff --git "a/logs/events.out.tfevents.1754050838.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.23260.1" "b/logs/events.out.tfevents.1754050838.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.23260.1" new file mode 100644 index 0000000000000000000000000000000000000000..aa3f5aa6413a5c1f73bfa77e47e312da29ed208f --- /dev/null +++ "b/logs/events.out.tfevents.1754050838.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.23260.1" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c9cebc90490e52af51a72119bcf5b40cda5e5e383950a4be64e382d78854fdf +size 5642 diff --git a/nwgi.py b/nwgi.py new file mode 100644 
"""Sentiment evaluation on the ``oliverwang15/news_with_gpt_instructions`` data."""

import warnings

# Kept from the original module: importing it silences every Python warning.
warnings.filterwarnings("ignore")

# Seven-level dataset labels folded into the three classes that are scored.
# Note that both "mildly" labels are treated as neutral.
dic = {
    'strong negative': "negative",
    'moderately negative': "negative",
    'mildly negative': "neutral",
    'strong positive': "positive",
    'moderately positive': "positive",
    'mildly positive': 'neutral',
    'neutral': 'neutral',
}

# Marker that ends every prompt; the model's answer is whatever follows it.
_ANSWER_MARKER = "Answer: "


def format_example(example: dict) -> dict:
    """Build the prompt and the gold answer for one example.

    Args:
        example: Mapping (a plain dict or a DataFrame row) with the keys
            ``instruction`` and ``output`` and an optional ``input``.

    Returns:
        ``{"context": prompt, "target": example["output"]}`` where the prompt
        is ``Instruction: ...``, an ``Input: ...`` line when ``input`` is
        present and truthy, and a trailing ``Answer: `` marker.
    """
    context = f"Instruction: {example['instruction']}\n"
    if example.get("input"):
        context += f"Input: {example['input']}\n"
    context += _ANSWER_MARKER
    return {"context": context, "target": example["output"]}


def change_target(x):
    """Collapse free-form text to ``positive``, ``negative`` or ``neutral``.

    The match is case-insensitive (the original only recognised the two
    spellings ``positive``/``Positive`` and ``negative``/``Negative``, so an
    answer such as ``POSITIVE`` was silently counted as neutral).
    ``positive`` is checked first; text containing neither word is neutral.
    """
    lowered = x.lower()
    if "positive" in lowered:
        return "positive"
    if "negative" in lowered:
        return "negative"
    return "neutral"


def _split_batches(items, batch_size):
    """Return consecutive slices of *items*, each at most *batch_size* long."""
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    return [items[start:start + batch_size] for start in range(0, len(items), batch_size)]


def _extract_answer(decoded):
    """Return the segment after the first answer marker, or ``""`` if absent."""
    parts = decoded.split(_ANSWER_MARKER)
    # parts[1] is exactly what the original `split(...)[1]` produced; the
    # fallback replaces an IndexError that used to abort the whole evaluation.
    return parts[1] if len(parts) > 1 else ""


def test_nwgi(model, tokenizer, batch_size=8, prompt_fun=None):
    """Generate answers for the ``test`` split and print accuracy / F1 scores.

    Args:
        model: Object exposing ``generate(**tokens, max_length=...)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        batch_size: Number of prompts sent to the model per step.
        prompt_fun: Optional callable applied to every row (``axis=1``) to
            build the instruction text; a fixed question is used when None.
            It runs before the ``news`` column is copied to ``input``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``input``, ``output``,
        ``instruction``, ``context``, ``target``, ``out_text``,
        ``new_target`` and ``new_out``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Imported lazily so the pure helpers above stay importable (and testable)
    # on machines without the evaluation stack installed.
    import torch
    from datasets import load_dataset
    from sklearn.metrics import accuracy_score, f1_score
    from tqdm import tqdm

    dataset = load_dataset('oliverwang15/news_with_gpt_instructions')
    dataset = dataset['test'].to_pandas()
    dataset['output'] = dataset['label'].apply(lambda x: dic[x])

    if prompt_fun is None:
        dataset["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
    else:
        dataset["instruction"] = dataset.apply(prompt_fun, axis=1)
    dataset["input"] = dataset["news"]

    # Copy the column subset so the assignments below write to an independent
    # frame instead of a slice of the loaded one.
    dataset = dataset[['input', 'output', 'instruction']].copy()
    dataset[["context", "target"]] = dataset.apply(
        format_example, axis=1, result_type="expand"
    )

    context = dataset['context'].tolist()

    # print example
    print(f"\n\nPrompt example:\n{context[0]}\n\n")

    # Slicing by start offset never yields an empty trailing batch, which the
    # old `n // batch_size + 1` step count did whenever n was a multiple of
    # batch_size.
    batches = _split_batches(context, batch_size)
    total_steps = len(batches)
    print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}")

    out_text_list = []
    for tmp_context in tqdm(batches):
        tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512)
        tokens = {key: value.cuda() for key, value in tokens.items()}
        res = model.generate(**tokens, max_length=512)
        res_sentences = [tokenizer.decode(sequence) for sequence in res]
        out_text_list += [_extract_answer(sentence) for sentence in res_sentences]
        torch.cuda.empty_cache()

    dataset["out_text"] = out_text_list
    dataset["new_target"] = dataset["target"].apply(change_target)
    dataset["new_out"] = dataset["out_text"].apply(change_target)

    acc = accuracy_score(dataset["new_target"], dataset["new_out"])
    f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average="macro")
    f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average="micro")
    f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average="weighted")

    print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ")

    return dataset


# The name starts with "test_", so pytest would try to collect this evaluation
# entry point whenever it is imported into a test module; opt out explicitly.
test_nwgi.__test__ = False
"a/runs/Jul30_19-46-32_\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206/events.out.tfevents.1753875992.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.19780.6" "b/runs/Jul30_19-46-32_\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206/events.out.tfevents.1753875992.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.19780.6" new file mode 100644 index 0000000000000000000000000000000000000000..747e635e489f7c4d06d6cba474a6e287a9b08eaf --- /dev/null +++ "b/runs/Jul30_19-46-32_\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206/events.out.tfevents.1753875992.\344\275\240\347\232\204\347\241\254\345\270\201\346\216\211\344\272\206.19780.6" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b478d3342bdda4372e67d8094a0a2cc9a3fab3781e347c59c3709d46d47662e +size 15764 diff --git a/tfns.py b/tfns.py new file mode 100644 index 0000000000000000000000000000000000000000..4e8040194310d7483f01bcb2d917bb2fbc8f4485 --- /dev/null +++ b/tfns.py @@ -0,0 +1,79 @@ +import warnings +warnings.filterwarnings("ignore") + +from sklearn.metrics import accuracy_score,f1_score +from datasets import load_dataset +from tqdm import tqdm +import datasets +import torch + +dic = { + 0:"negative", + 1:'positive', + 2:'neutral', +} + +def format_example(example: dict) -> dict: + context = f"Instruction: {example['instruction']}\n" + if example.get("input"): + context += f"Input: {example['input']}\n" + context += "Answer: " + target = example["output"] + return {"context": context, "target": target} + +def change_target(x): + if 'positive' in x or 'Positive' in x: + return 'positive' + elif 'negative' in x or 'Negative' in x: + return 'negative' + else: + return 'neutral' + +def test_tfns(model, tokenizer, batch_size = 8, prompt_fun = None ): + dataset = load_dataset('zeroshot/twitter-financial-news-sentiment') + dataset = dataset['validation'] + dataset = 
dataset.to_pandas() + dataset['label'] = dataset['label'].apply(lambda x:dic[x]) + + if prompt_fun is None: + dataset["instruction"] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.' + else: + dataset["instruction"] = dataset.apply(prompt_fun, axis = 1) + + dataset.columns = ['input', 'output', 'instruction'] + dataset[["context","target"]] = dataset.apply(format_example, axis = 1, result_type="expand") + + # print example + print(f"\n\nPrompt example:\n{dataset['context'][0]}\n\n") + + context = dataset['context'].tolist() + + total_steps = dataset.shape[0]//batch_size + 1 + print(f"Total len: {len(context)}. Batchsize: {batch_size}. Total steps: {total_steps}") + + + out_text_list = [] + for i in tqdm(range(total_steps)): + tmp_context = context[i* batch_size:(i+1)* batch_size] + tokens = tokenizer(tmp_context, return_tensors='pt', padding=True, max_length=512) + # tokens.pop('token_type_ids') + for k in tokens.keys(): + tokens[k] = tokens[k].cuda() + res = model.generate(**tokens, max_length=512) + res_sentences = [tokenizer.decode(i) for i in res] + out_text = [o.split("Answer: ")[1] for o in res_sentences] + out_text_list += out_text + torch.cuda.empty_cache() + + dataset["out_text"] = out_text_list + dataset["new_target"] = dataset["target"].apply(change_target) + dataset["new_out"] = dataset["out_text"].apply(change_target) + + acc = accuracy_score(dataset["new_target"], dataset["new_out"]) + f1_macro = f1_score(dataset["new_target"], dataset["new_out"], average = "macro") + f1_micro = f1_score(dataset["new_target"], dataset["new_out"], average = "micro") + f1_weighted = f1_score(dataset["new_target"], dataset["new_out"], average = "weighted") + + print(f"Acc: {acc}. F1 macro: {f1_macro}. F1 micro: {f1_micro}. F1 weighted (BloombergGPT): {f1_weighted}. ") + + return dataset \ No newline at end of file